Coverage for src/sensai/vector_model.py: 76%

368 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-29 18:29 +0000

1""" 

2This module defines base classes for models that use pandas.DataFrames for inputs and outputs, where each data frame row represents 

3a single model input or output. Since every row contains a vector of data (one-dimensional array), we refer to them as vector-based 

4models. Hence the name of the module and of the central base class :class:`VectorModel`. 

5""" 

6 

7import logging 

8import typing 

9from abc import ABC, abstractmethod 

10from typing import List, Any, Optional, Union, Type, Dict 

11 

12import numpy as np 

13import pandas as pd 

14 

15from .util.deprecation import deprecated 

16from .data import InputOutputData 

17from .data_transformation import DataFrameTransformer, DataFrameTransformerChain, InvertibleDataFrameTransformer 

18from .featuregen import FeatureGenerator, FeatureCollector 

19from .util import mark_used, kwarg_if_not_none 

20from .util.cache import PickleLoadSaveMixin 

21from .util.logging import StopWatch 

22from .util.pickle import setstate, getstate 

23from .util.sequences import get_first_duplicate 

24from .util.string import ToStringMixin 

25 

# Module-level logger used by all model classes defined in this module.
log = logging.getLogger(__name__)

# Self-type variables: they let the fluent ``with_*`` methods declare that they
# return the same (sub)class type as the instance they were called on.
TVectorModelBase = typing.TypeVar("TVectorModelBase", bound="VectorModelBase")
TVectorModel = typing.TypeVar("TVectorModel", bound="VectorModel")
TVectorRegressionModel = typing.TypeVar("TVectorRegressionModel", bound="VectorRegressionModel")

# InputOutputData is imported above and referenced here so that the import is
# considered used; it is kept in this module's namespace for backward compatibility.
mark_used(InputOutputData)  # for backward compatibility

32 

33 

class VectorModelBase(ABC, ToStringMixin):
    """
    Base class for vector models, which defines the fundamental prediction interface.
    A vector model takes data frames as input, where each row represents a vector of information.

    In addition to the abstract prediction interface, the class manages an optional
    human-readable model name.
    """
    def __init__(self):
        # no name assigned yet; get_name() will synthesise one on demand
        self._name = None

    @abstractmethod
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the model to the given input data frame.

        :param x: the input data frame
        :return: the data frame containing the model's outputs
        """

    @abstractmethod
    def is_regression_model(self) -> bool:
        """
        :return: whether this model is a regression model
        """

    @abstractmethod
    def get_predicted_variable_names(self) -> list:
        """
        :return: the list of names of the variables predicted by this model
        """

    def with_name(self: TVectorModelBase, name: str) -> TVectorModelBase:
        """
        Fluent variant of :meth:`set_name`.

        :param name: the name to assign to the model
        :return: self
        """
        self.set_name(name)
        return self

    def set_name(self, name):
        """
        Assigns a name to the model.

        :param name: the name
        """
        self._name = name

    def get_name(self):
        """
        :return: the name that was explicitly assigned to the model or, if none was assigned,
            a generated placeholder built from the class name and the (hexadecimal) object id
        """
        assigned_name = self._name
        if assigned_name is not None:
            return assigned_name
        return "unnamed-%s-%x" % (self.__class__.__name__, id(self))

71 

72 

class TrainingContext:
    """
    Holds context information for a training process that is currently in progress:
    the input and output data frames exactly as they were handed over for fitting.
    """
    def __init__(self, original_input: pd.DataFrame, original_output: pd.DataFrame):
        """
        :param original_input: the input data frame as originally passed for training
        :param original_output: the output data frame as originally passed for training
        """
        self.original_input, self.original_output = original_input, original_output

80 

81 

class VectorModel(VectorModelBase, PickleLoadSaveMixin, ABC):
    """
    Represents a model which uses data frames as inputs and outputs whose rows define individual data points.
    Every data frame row represents a vector of information (one-dimensional array), hence the name of the model.
    Note that the vectors in question are not necessarily vectors in the mathematical sense, as the information in each cell is not
    required to be numeric or uniform but can be arbitrarily complex.

    Input processing pipeline (see :meth:`_compute_model_inputs`):
    raw input transformers -> feature generator (optional) -> feature transformers -> underlying model.
    """
    TOSTRING_INCLUDE_PREPROCESSORS = True
    # members that are never persisted and are reset to None when unpickling
    _TRANSIENT_MEMBERS = ["_trainingContext"]
    # old attribute name -> current attribute name (applied when loading old pickles)
    _RENAMED_MEMBERS = {
        "checkInputColumns": "_checkInputColumns",
        "_inputTransformerChain": "_featureTransformerChain"
    }

    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: whether to check if the input column list (that is fed to the underlying model, i.e. after feature
            generation) during inference coincides with the input column list that was observed during training.
            This should be disabled if feature generation is not performed by the model itself, e.g. in meta-models
            such as ensemble models.
        """
        super().__init__()
        self._featureGenerator: Optional[FeatureGenerator] = None
        self._rawInputTransformerChain = DataFrameTransformerChain()
        self._featureTransformerChain = DataFrameTransformerChain()
        self._isFitted = False  # Note: this keeps track only of the actual model being fitted, not the pre/postprocessors
        self._predictedVariableNames: Optional[list] = None
        self._modelInputVariableNames: Optional[list] = None
        self._checkInputColumns = check_input_columns

        # transient members
        self._trainingContext: Optional[TrainingContext] = None

    def __getstate__(self):
        return getstate(VectorModel, self, transient_properties=self._TRANSIENT_MEMBERS)

    def __setstate__(self, state):
        # transient members are not part of the persisted state; make sure they exist after loading
        for m in VectorModel._TRANSIENT_MEMBERS:
            state[m] = None
        setstate(VectorModel, self, state, renamed_properties=self._RENAMED_MEMBERS,
            new_default_properties={"_rawInputTransformerChain": DataFrameTransformerChain()})

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_exclude_exceptions(self) -> List[str]:
        e = super()._tostring_exclude_exceptions()
        if self.TOSTRING_INCLUDE_PREPROCESSORS:
            e += ["_rawInputTransformerChain", "_featureGenerator", "_featureTransformerChain"]
        return e

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        d = super()._tostring_additional_entries()
        if self._featureGenerator is not None:
            d["featureGeneratorNames"] = self._featureGenerator.get_names()
        if self._name is not None:
            d["name"] = self._name
        return d

    def with_raw_input_transformers(self: TVectorModel,
            *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
        """
        Makes the model use the given transformers (removing previously set raw input transformers, if any), which
        are to be applied to the raw input data frame (prior to feature generation).

        :param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of inputs
        :return: self
        """
        self._rawInputTransformerChain = DataFrameTransformerChain(*transformers)
        return self

    def with_feature_transformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]],
            add=False) -> TVectorModel:
        """
        Makes the model use the given transformers
        which are to be applied to the data frames generated by feature generators.
        (If the model does not use feature generators, the transformers will be applied to
        whatever is produced by the raw input transformers or, if there are none, the original raw
        input data frame).

        :param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of features;
            each argument may be a single transformer or a list of transformers
        :param add: whether to add the transformers to the existing transformers rather than replacing them
        :return: self
        """
        if not add:
            self._featureTransformerChain = DataFrameTransformerChain(*transformers)
        else:
            for t in transformers:
                # An argument may itself be a list of transformers (see the signature); unpack it so
                # that individual transformers rather than a list object are appended to the chain.
                group = t if isinstance(t, (list, tuple)) else [t]
                for transformer in group:
                    self._featureTransformerChain.append(transformer)
        return self

    @deprecated("Use with_feature_transformers instead; this method will be removed in a future sensAI release.")
    def with_input_transformers(self: TVectorModel,
            *input_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
        """
        Makes the model use the given feature transformers (removing previously set transformers, if any),
        i.e. it transforms the data frame that is generated by the feature generators (if any).

        :param input_transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of inputs
        :return: self
        """
        return self.with_feature_transformers(*input_transformers)

    def with_feature_generator(self: TVectorModel, feature_generator: Optional[FeatureGenerator]) -> TVectorModel:
        """
        Makes the model use the given feature generator in order to obtain the model inputs.
        If the model shall use more than one feature generator, pass a :class:`MultiFeatureGenerator` which combines them or
        use the perhaps more convenient :class:`FeatureCollector` in conjunction with :meth:`with_feature_collector`.

        Note: Feature computation takes place before feature transformation (but after raw input transformation).

        :param feature_generator: the feature generator to use for input computation; None removes the current generator
        :return: self
        """
        self._featureGenerator = feature_generator
        return self

    def with_feature_collector(self: TVectorModel, feature_collector: FeatureCollector,
            shared: bool = False) -> TVectorModel:
        """
        Makes the model use a multi-feature generator obtained from the given collector
        in order to compute the underlying model's input from the data frame that is given.
        Overrides any feature generator previously passed to :meth:`with_feature_generator` (if any).

        Note: Feature generation takes place before feature transformation.

        :param feature_collector: the feature collector from which to obtain the multi-feature generator
        :param shared: whether the given feature collector is shared between models (i.e. whether
            the same instance is passed to multiple models).
            Passing `shared=True` ensures that models using the same collector do not end up
            using the same multi-feature generator instance and instead receive an independent instance.
        :return: self
        """
        if shared:
            self._featureGenerator = feature_collector.create_multi_feature_generator()
        else:
            self._featureGenerator = feature_collector.get_multi_feature_generator()
        return self

    def _pre_processors_are_fitted(self):
        """
        :return: True iff both transformer chains and the feature generator (if any) are fitted
        """
        result = self._rawInputTransformerChain.is_fitted() and self._featureTransformerChain.is_fitted()
        if self.get_feature_generator() is not None:
            result = result and self.get_feature_generator().is_fitted()
        return result

    def is_fitted(self):
        """
        :return: True if the model has been fitted (underlying model as well as all preprocessors), False otherwise
        """
        if not self._is_underlying_model_fitted():
            return False
        if not self._pre_processors_are_fitted():
            return False
        return True

    def _is_underlying_model_fitted(self):
        # a model that does not require fitting counts as fitted from the start
        underlying_model_is_fitted = not self._underlying_model_requires_fitting() or self._isFitted
        return underlying_model_is_fitted

    def _check_model_input_columns(self, model_input: pd.DataFrame):
        """
        Raises an exception if column checking is enabled and the columns of the given model input
        differ (in name or order) from the columns recorded during fitting.

        :param model_input: the data frame that is about to be passed to the underlying model
        """
        if self._checkInputColumns and list(model_input.columns) != self._modelInputVariableNames:
            raise Exception(f"Inadmissible input data frame: "
                            f"expected columns {self._modelInputVariableNames}, got {list(model_input.columns)}")

    def compute_model_inputs(self, x: pd.DataFrame):
        """
        Applies feature generators and input transformers (if any) to generate from an input data frame the input for the
        underlying model

        :param x: the input data frame, to which input preprocessing is to be applied
        :return: the input data frame that serves as input for the underlying model
        """
        return self._compute_model_inputs(x)

    def _compute_model_inputs(self, x: pd.DataFrame, y: pd.DataFrame = None, fit=False) -> pd.DataFrame:
        """
        Runs the input through the preprocessing pipeline: raw input transformers, then the feature
        generator (if any), then the feature transformers.

        :param x: the input data frame
        :param y: the output data frame (when training); only has to be provided if ``fit=True`` and preprocessors require outputs
            for fitting
        :param fit: if True, preprocessors will be fitted before being applied to ``x``
        :return: the data frame to be passed to the underlying model
        """
        if fit:
            x = self._rawInputTransformerChain.fit_apply(x)
            if self._featureGenerator is not None:
                x = self._featureGenerator.fit_generate(x, y, self)
            x = self._featureTransformerChain.fit_apply(x)
        else:
            x = self._rawInputTransformerChain.apply(x)
            if self._featureGenerator is not None:
                x = self._featureGenerator.generate(x, self)
            x = self._featureTransformerChain.apply(x)
        return x

    def _compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        # identity by default; subclasses may transform the targets
        return y

    def compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        """
        :param y: the output (ground truth) data frame
        :return: the data frame of outputs as seen by the underlying model (here: ``y`` unchanged)
        """
        return self._compute_model_outputs(y)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the model to the given input data frame

        :param x: the input data frame
        :return: the model outputs in the form of a data frame whose index corresponds to the index of ``x``
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model {self} "
                            f"(isUnderlyingModelFitted={self._is_underlying_model_fitted()}, "
                            f"preProcessorsAreFitted={self._pre_processors_are_fitted()})")
        x = self._compute_model_inputs(x)
        self._check_model_input_columns(x)
        y = self._predict(x)
        return self._create_output_data_frame(y, x.index)

    def _create_output_data_frame(self, y: Union[pd.DataFrame, list], index):
        """
        Converts the result of :meth:`_predict` into a data frame carrying the given index.

        :param y: a data frame or, for single-column predictions, the sequence of predicted values
        :param index: the index to assign to the result
        :return: the output data frame
        """
        if isinstance(y, pd.DataFrame):
            # make sure the data frame has the right index
            y.index = index
            return y
        else:
            predicted_columns = self.get_predicted_variable_names()
            if len(predicted_columns) != 1:
                raise ValueError(f"_predict must return a DataFrame as there are multiple predicted columns; got {type(y)}")
            return pd.DataFrame(pd.Series(y, name=predicted_columns[0], index=index))

    @abstractmethod
    def _predict(self, x: pd.DataFrame) -> Union[pd.DataFrame, list]:
        """
        :param x: the input data frame
        :return: the output data frame, or, for the case where a single column is to be predicted, the list of values for that column
        """
        pass

    def _underlying_model_requires_fitting(self) -> bool:
        """
        Designed to be overridden for rule-based models.

        :return: True iff the underlying model requires fitting
        """
        return True

    def _fit_preprocessors(self, x: pd.DataFrame, y: pd.DataFrame = None):
        """
        Fits all preprocessors (raw input transformers, feature generator, feature transformers) without
        computing the final model input.

        :param x: the raw input data frame
        :param y: the output data frame (may be None)
        """
        # Downstream preprocessors must be fitted on the *transformed* raw input, because that is what
        # they receive at inference time (see _compute_model_inputs); the transformation is skipped
        # only if nothing consumes its result.
        has_downstream_preprocessors = self._featureGenerator is not None or len(self._featureTransformerChain) > 0
        if has_downstream_preprocessors:
            x = self._rawInputTransformerChain.fit_apply(x)
        else:
            self._rawInputTransformerChain.fit(x)
        # no need for fit_generate if chain is empty
        if self._featureGenerator is not None:
            if len(self._featureTransformerChain) == 0:
                self._featureGenerator.fit(x, y)
            else:
                x = self._featureGenerator.fit_generate(x, y, self)
        self._featureTransformerChain.fit(x)

    def fit_input_output_data(self, io_data: InputOutputData, fit_preprocessors=True, fit_model=True):
        """
        Fits the model using the given data

        :param io_data: the input/output data
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        """
        self.fit(io_data.inputs, io_data.outputs, weights=io_data.weights, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame], weights: Optional[pd.Series] = None, fit_preprocessors=True, fit_model=True):
        """
        Fits the model using the given data

        :param x: a data frame containing input data
        :param y: a data frame containing output data; may be None if the underlying model does not actually require
            fitting, e.g. in the case of a rule-based models, but fitting is still necessary for preprocessors
        :param weights: an optional series (with the same index as `x` and `y`) containing data point weights.
            Added in v1.2.0.
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises Exception: if the underlying model requires fitting but ``y`` is None
        :raises ValueError: if inputs and outputs do not match in length
        """
        self._trainingContext = TrainingContext(x, y)
        try:
            log.info(f"Fitting {self.__class__.__name__} instance")
            sw = StopWatch()
            # y may legitimately be None (see docstring); in that case the predicted variable
            # names cannot be derived here and any previously set value is retained
            if y is not None:
                self._predictedVariableNames = list(y.columns)
            if not self._underlying_model_requires_fitting():
                if fit_preprocessors:
                    self._fit_preprocessors(x, y=y)
                self._modelInputVariableNames = None  # not known for rule-based models because the fitting process is optimised
            else:
                if y is None:
                    raise Exception(f"The underlying model requires a data frame for fitting but Y=None was passed")
                if len(x) != len(y):
                    raise ValueError(f"Length of input ({len(x)}) does not match length of output ({len(y)})")
                y = self._compute_model_outputs(y)
                x = self._compute_model_inputs(x, y=y, fit=fit_preprocessors)
                if len(x) != len(y):
                    # preprocessing may filter rows; recover the matching outputs via the index
                    log.debug(f"Input computation changed number of data points ({len(self._trainingContext.original_input)} -> {len(x)})")
                    y = y.loc[x.index]
                    if len(x) != len(y):
                        raise ValueError("Could not recover matching outputs for changed inputs. Only input filtering is admissible, "
                            "indices of input & ouput data frames must match.")
                self._modelInputVariableNames = list(x.columns)
                if fit_model:
                    inputs_with_types = ', '.join([n + '/' + x[n].dtype.name for n in self._modelInputVariableNames])
                    log.debug(f"Fitting with outputs[{len(y.columns)}]={list(y.columns)}, "
                        f"inputs[{len(self._modelInputVariableNames)}]=[{inputs_with_types}]; N={len(x)} data points")
                    # weights are passed on only if given, such that _fit implementations without
                    # a weights parameter keep working
                    self._fit(x, y, **kwarg_if_not_none("weights", weights))
                    self._isFitted = True
                else:
                    log.info("Fitting of underlying model skipped")
            log.info(f"Fitting completed in {sw.get_elapsed_time_secs():.2f} seconds: {self}")
        finally:
            # the training context exists only for the duration of the fitting process
            self._trainingContext = None

    def is_being_fitted(self) -> bool:
        """
        :return: True if the model is currently in the process of being fitted, False otherwise
        """
        return self._trainingContext is not None

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.DataFrame] = None):
        """
        Fits the underlying model.

        :param x: the model input data frame (after preprocessing)
        :param y: the model output data frame
        :param weights: optional data point weights
        """
        pass

    def _warn_sample_weights_unsupported(self, is_weighting_supported: bool, weights: Optional[pd.Series]):
        """
        Logs a warning if weights were provided although weighting is not supported.

        :param is_weighting_supported: whether the model supports data point weighting
        :param weights: the weights that were passed (if any)
        """
        if weights is not None and not is_weighting_supported:
            log.warning(f"Data point weighting not supported by {self.__class__.__name__}; ignoring weights")

    def get_predicted_variable_names(self):
        """
        :return: the list of variable names that are ultimately output by this model (i.e. the columns of the data frame output
            by :meth:`predict`)
        """
        return self._predictedVariableNames

    def get_model_input_variable_names(self) -> Optional[List[str]]:
        """
        :return: the list of variable names required by the underlying model as input (after feature generation and data frame
            transformation) or None if the model has not been fitted (or is a rule-based model which does not determine the variable names).
        """
        return self._modelInputVariableNames

    @deprecated("Use get_feature_transformer instead, this method will be removed in a future release")
    def get_input_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) feature transformer of the given type (if any) within this models feature transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching feature transformer or None
        """
        return self.get_feature_transformer(cls)

    def get_feature_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) feature transformer of the given type (if any) within this models feature transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching feature transformer or None
        """
        for it in self._featureTransformerChain.dataFrameTransformers:
            if isinstance(it, cls):
                return it
        return None

    def get_raw_input_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) raw input transformer of the given type (if any) within this models raw input transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching raw input transformer or None
        """
        for it in self._rawInputTransformerChain.dataFrameTransformers:
            if isinstance(it, cls):
                return it
        return None

    @deprecated("Use get_feature_transformer_chain instead, this method will be removed in a future release")
    def get_input_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's feature transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied after feature generation
        """
        return self._featureTransformerChain

    def get_raw_input_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's raw input transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied before feature generation
        """
        return self._rawInputTransformerChain

    def get_feature_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's feature transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied after feature generation
        """
        return self._featureTransformerChain

    def set_feature_generator(self, feature_generator: Optional[FeatureGenerator]):
        """
        Non-fluent variant of :meth:`with_feature_generator`.

        :param feature_generator: the feature generator to use (or None to use none)
        """
        self.with_feature_generator(feature_generator)

    def get_feature_generator(self) -> Optional[FeatureGenerator]:
        """
        :return: the model's feature generator (if any)
        """
        return self._featureGenerator

    def remove_input_preprocessors(self):
        """
        Removes all input preprocessors (i.e. raw input transformers, feature generators and feature transformers) from the model
        """
        self.with_raw_input_transformers()
        self.with_feature_generator(None)
        self.with_feature_transformers()

495 

496 

class VectorRegressionModel(VectorModel, ABC):
    """
    Base class for vector models that perform regression. On top of the input preprocessing
    inherited from :class:`VectorModel`, it supports an invertible target transformer (applied to the
    targets for fitting, inverted on predictions) and a chain of output transformers (applied to
    predictions only).
    """
    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: Whether to check if the input column list (after feature generation)
            during inference coincides with the input column list during fit.
            This should be disabled if feature generation is not performed by the model itself,
            e.g. in ensemble models.
        """
        super().__init__(check_input_columns=check_input_columns)
        self._outputTransformerChain = DataFrameTransformerChain()
        self._modelOutputVariableNames: Optional[list] = None
        self._targetTransformer: Optional[InvertibleDataFrameTransformer] = None

    def _tostring_exclude_exceptions(self) -> List[str]:
        names = super()._tostring_exclude_exceptions()
        if self.TOSTRING_INCLUDE_PREPROCESSORS:
            names.append("_targetTransformer")
        return names

    def is_regression_model(self) -> bool:
        return True

    def with_output_transformers(self: TVectorRegressionModel,
            *output_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorRegressionModel:
        """
        Replaces the model's output transformers with the given ones; calling the method without
        arguments removes all output transformers.
        Output transformers play no role during fitting and are not supported for rule-based models.

        **Important**: The chain as a whole must leave the column names (and their order) unchanged, i.e.
        the last transformer's output columns must equal the first transformer's input columns.
        Otherwise, an exception will be raised when :meth:`predict` is called.

        **Note**: Output transformers are a pure post-processing step applied after the actual
        predictions have been made. Unlike invertible target transformers, they are not invoked during
        the fit phase, so any losses computed there, including losses on validation sets (e.g. for
        early stopping), refer to the data without post-processing.
        A typical use case is the improvement of a fittable model's predictions via heuristics or
        hand-crafted rules.

        **How not to use**: Output transformers should not change the semantic meaning of the
        predictions (e.g. converting normalized values into non-normalized ones); use a target
        transformer for such purposes.

        :param output_transformers: DataFrameTransformers for the transformation of outputs
            (after the model has been applied)
        :return: self
        """
        # Post-processing serves no purpose in rule-based models, so it is rejected for them
        requires_fitting = self._underlying_model_requires_fitting()
        if not requires_fitting:
            raise Exception(f"Output transformers are not supported for model of type {self.__class__.__name__}")
        self._outputTransformerChain = DataFrameTransformerChain(*output_transformers)
        return self

    def with_target_transformer(self: TVectorRegressionModel,
            target_transformer: Optional[InvertibleDataFrameTransformer]) -> TVectorRegressionModel:
        """
        Makes the model use the given target transformer: the underlying low-level model is trained on the
        transformed targets, while this high-level model continues to output the original (untransformed)
        quantities. To achieve this, the transformation is applied to the targets during training and its
        inverse is applied to the underlying model's predictions during inference, which is why the
        transformer must be invertible.

        Rule-based models do not support this method, because they are not trained and the transformation
        would thus serve no purpose.

        NOTE: Feature generators and data frame transformers which make use of outputs are fitted on the
        untransformed targets; the target transformer affects the fitting of the underlying model only.

        :param target_transformer: a transformer which transforms the targets (training data outputs) prior to learning the model, such
            that the model learns to predict the transformed outputs
        :return: self
        """
        # Rule-based models are not fitted and therefore cannot make use of transformed targets
        requires_fitting = self._underlying_model_requires_fitting()
        if not requires_fitting:
            raise Exception(f"Target transformers are not supported for model of type {self.__class__.__name__}")
        self._targetTransformer = target_transformer
        return self

    def get_target_transformer(self):
        """
        :return: the target transformer (if any)
        """
        return self._targetTransformer

    def get_output_transformer_chain(self):
        """
        :return: the chain of output transformers (which may be empty)
        """
        return self._outputTransformerChain

    def _apply_post_processing(self, y: pd.DataFrame):
        """
        Post-processes the underlying model's predictions: inverts the target transformation (if any)
        and then applies the output transformer chain.

        :param y: the predictions of the underlying model
        :return: the post-processed predictions
        :raises Exception: if the resulting columns differ from the predicted variable names
        """
        target_transformer = self._targetTransformer
        if target_transformer is not None:
            y = target_transformer.apply_inverse(y)
        y = self._outputTransformerChain.apply(y)

        if list(y.columns) == self.get_predicted_variable_names():
            return y
        raise Exception(
            f"The model's predicted variable names are not correct. Got "
            f"{list(y.columns)} but expected {self.get_predicted_variable_names()}. "
            f"This kind of error can happen if the model's outputTransformerChain changes a data frame's "
            f"columns (e.g. renames them or changes order). Only output transformer chains that do not change "
            f"columns are permitted in VectorModel. You can fix this by modifying this instance's outputTransformerChain, "
            f"e.g. by calling .withOutputTransformers() with the correct input "
            f"(which can be empty to remove existing output transformers)"
        )

    def _compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        """
        Fits the target transformer (if any) on the given outputs and applies it. While the model is
        being fitted, the resulting column names are recorded as the model output variable names.

        :param y: the original output data frame
        :return: the outputs as they are presented to the underlying model
        """
        target_transformer = self._targetTransformer
        if target_transformer is not None:
            y = target_transformer.fit_apply(y)
        if self.is_being_fitted():
            self._modelOutputVariableNames = list(y.columns)
        return y

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the model to the given inputs and post-processes the result
        (inverse target transformation followed by output transformation).

        :param x: the input data frame
        :return: the data frame of predictions
        """
        raw_predictions = super().predict(x)
        return self._apply_post_processing(raw_predictions)

    def is_fitted(self):
        """
        :return: True iff the model including its preprocessors, the target transformer (if any) and
            the output transformer chain are all fitted
        """
        if not super().is_fitted():
            return False
        target_transformer = self._targetTransformer
        target_transformer_ready = target_transformer is None or target_transformer.is_fitted()
        if not target_transformer_ready:
            return False
        return True if self._outputTransformerChain.is_fitted() else False

    def get_model_output_variable_names(self):
        """
        Gets the list of variable names predicted by the underlying model alone, i.e. the column names
        of the targets as they were presented to the underlying model during the last fit
        (after application of the target transformer, if any).
        For the variable names that are ultimately output by the entire model when calling
        :meth:`predict`, use :meth:`get_predicted_variable_names`.

        :return: the list of variable names, or None if it was never recorded
        """
        return self._modelOutputVariableNames

626 

627 

def get_predicted_var_name(specified_var_name: Optional[str], predicted_var_names: List[str]):
    """
    Determines the name of the predicted variable to consider.

    :param specified_var_name: the explicitly specified variable name; if not None, it is returned as is
    :param predicted_var_names: the names of all variables predicted by the model; used only if no
        name was specified explicitly
    :return: the specified name or, if it is None, the first (and only) predicted variable name
    :raises ValueError: if no name was specified and there is more than one predicted variable
    """
    # guard clause: an explicit specification always wins
    if specified_var_name is not None:
        return specified_var_name
    is_ambiguous = len(predicted_var_names) > 1
    if is_ambiguous:
        raise ValueError("Must explicitly specify the predicted variable name for a model with multiple output variables "
                         f"({predicted_var_names})")
    return predicted_var_names[0]

636 

637 

class VectorClassificationModel(VectorModel, ABC):
    """
    Base class for vector-based classification models. Fitting requires exactly one output column containing
    the class labels; the sorted list of distinct labels is retained and used to validate probability outputs.
    """
    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: Whether to check if the input column list (after feature generation)
            during inference coincides with the input column list during fit.
            This should be disabled if feature generation is not performed by the model itself,
            e.g. in ensemble models.
        """
        super().__init__(check_input_columns=check_input_columns)
        # sorted list of class labels; remains None until it is set (e.g. by _fit)
        self._labels = None

    def is_regression_model(self) -> bool:
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Determines the sorted list of class labels from the single output column and delegates the actual
        fitting to :meth:`_fit_classifier`.

        :param x: the input data
        :param y: the output data, which must contain exactly one column (the class labels)
        :param weights: optional data point weights
        :raises ValueError: if ``y`` does not contain exactly one column
        """
        if len(y.columns) != 1:
            raise ValueError("Classification requires exactly one output column with class labels")
        self._labels = sorted(y.iloc[:, 0].unique())
        # the weights are passed via kwarg_if_not_none, presumably so that the keyword is omitted
        # entirely when no weights are given
        self._fit_classifier(x, y, **kwarg_if_not_none("weights", weights))

    def get_class_labels(self) -> List[Any]:
        """
        :return: the sorted list of class labels (None if the labels have not been set yet)
        """
        return self._labels

    @abstractmethod
    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Fits the underlying classifier; called by :meth:`_fit` after the class labels have been determined.

        :param x: the input data
        :param y: the output data (a single column of class labels)
        :param weights: optional data point weights
        """
        pass

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a data frame as returned by predict_class_probabilities to a result as returned by predict,
        selecting, for each row, the label whose column holds the largest value.

        :param df: the output data frame from predict_class_probabilities; its columns must be the class labels
            (in any order)
        :return: an output data frame as it would be returned by predict, using the same index as ``df``
        :raises ValueError: if the columns of ``df`` do not coincide with the class labels
        """
        labels = self.get_class_labels()
        df_cols = list(df.columns)
        if sorted(df_cols) != labels:
            raise ValueError(f"Expected data frame with columns {labels}, got {df_cols}")
        # argmax yields column positions, which are mapped back to labels via the frame's own column order
        max_indices = np.argmax(df.values, axis=1)
        result = [df_cols[i] for i in max_indices]
        # retain the index of the probability frame so that predictions remain aligned with their inputs
        return pd.DataFrame(result, columns=self.get_predicted_variable_names(), index=df.index)

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame where the list of columns is the list of class labels and the values are probabilities, with the same
            index as the input data frame.
            Raises an exception if the classifier cannot predict probabilities.
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        x = self._compute_model_inputs(x)
        result = self._predict_class_probabilities(x)
        result.index = x.index
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the column names are correctly set, sorted and whether the entries correspond to probabilities

        :param prediction_df: the data frame of class probabilities to check
        :param max_rows_to_check: the number of leading rows for which the values are inspected
        :raises Exception: if the columns are not exactly the (sorted) class labels; value problems are
            only logged as warnings
        """
        labels = self.get_class_labels()
        if list(prediction_df.columns) != labels:
            raise Exception(f"{self} _predict_class_probabilities returned DataFrame with incorrect columns: "
                            f"expected {labels}, got {prediction_df.columns}")

        # only the first rows are inspected
        df_to_check = prediction_df.iloc[:max_rows_to_check]
        for i, (_, value_series) in enumerate(df_to_check.iterrows(), start=1):

            if not all(0 <= value_series) or not all(value_series <= 1):
                log.warning(f"Probabilities data frame may not be correctly normalised, "
                            f"got probabilities outside the range [0, 1]: checked row {i}/{max_rows_to_check} contains {list(value_series)}")

            s = value_series.sum()
            if not np.isclose(s, 1, atol=1e-2):
                log.warning(f"Probabilities data frame may not be correctly normalised: "
                            f"checked row {i}/{max_rows_to_check} contains {list(value_series)}")

    @abstractmethod
    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        If you are implementing a probabilistic classifier, this method has to return a data frame with probabilities
        (one column per label). The default implementation of _predict will then use the output of
        this method and convert it to predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you may override _predict instead of implementing this method. In the case of a
        non-probabilistic classifier, the implementation of this method should raise an exception.
        """
        raise NotImplementedError(f"{self.__class__.__name__} does not implement _predict_class_probabilities.")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts labels by computing class probabilities and choosing the most probable label for each row.

        :param x: the model input data
        :return: a data frame containing the predicted labels
        """
        predicted_probabilities_df = self._predict_class_probabilities(x)
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

734 

735 

class RuleBasedVectorRegressionModel(VectorRegressionModel, ABC):
    """
    Base class for rule-based regression models: the underlying model is declared not to require fitting,
    and :meth:`_fit` does nothing.
    """
    def __init__(self, predicted_variable_names: list):
        """
        :param predicted_variable_names: the names of the predicted variables; these are typically known
            at init time for rule-based models
        """
        super().__init__(check_input_columns=False)
        # The model output variable names are guaranteed to be the same as the predicted variable names,
        # since target transformers and output transformers are disallowed
        self._predictedVariableNames = self._modelOutputVariableNames = predicted_variable_names

    def _underlying_model_requires_fitting(self):
        """
        :return: False, i.e. no fitting of an underlying model is required
        """
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Does nothing; all arguments are ignored.
        """

751 

752 

class RuleBasedVectorClassificationModel(VectorClassificationModel, ABC):
    """
    Base class for rule-based classification models: the class labels are given at construction time,
    the underlying model is declared not to require fitting, and the fitting methods do nothing.
    """
    def __init__(self, labels: list, predicted_variable_name="predictedLabel"):
        """
        :param labels: the class labels, which must not contain duplicates; they are stored in sorted order
        :param predicted_variable_name: the name of the single predicted variable
        """
        super().__init__(check_input_columns=False)

        # reject label lists in which a label occurs more than once
        repeated_label = get_first_duplicate(labels)
        if repeated_label is not None:
            raise Exception(f"Found duplicate label: {repeated_label}")

        sorted_labels = sorted(labels)
        self._labels = sorted_labels
        self._predictedVariableNames = [predicted_variable_name]

    def _underlying_model_requires_fitting(self):
        """
        :return: False, i.e. no fitting of an underlying model is required
        """
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Does nothing; all arguments are ignored.
        """

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Does nothing; all arguments are ignored.
        """