Coverage for src/sensai/vector_model.py: 77%

366 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1""" 

2This module defines base classes for models that use pandas.DataFrames for inputs and outputs, where each data frame row represents 

3a single model input or output. Since every row contains a vector of data (one-dimensional array), we refer to them as vector-based 

4models. Hence the name of the module and of the central base class :class:`VectorModel`. 

5""" 

6 

7import logging 

8import typing 

9from abc import ABC, abstractmethod 

10from typing import List, Any, Optional, Union, Type, Dict 

11 

12import numpy as np 

13import pandas as pd 

14 

15from .util.deprecation import deprecated 

16from .data import InputOutputData 

17from .data_transformation import DataFrameTransformer, DataFrameTransformerChain, InvertibleDataFrameTransformer 

18from .featuregen import FeatureGenerator, FeatureCollector 

19from .util import mark_used 

20from .util.cache import PickleLoadSaveMixin 

21from .util.logging import StopWatch 

22from .util.pickle import setstate, getstate 

23from .util.sequences import get_first_duplicate 

24from .util.string import ToStringMixin 

25 

mark_used(InputOutputData)  # for backward compatibility

# module-level logger used by all classes in this module
log = logging.getLogger(__name__)
# type variables bound to the respective model base classes; they are used to annotate
# fluent methods (``with_*``) such that the concrete subclass type is retained in the return type
TVectorModelBase = typing.TypeVar("TVectorModelBase", bound="VectorModelBase")
TVectorModel = typing.TypeVar("TVectorModel", bound="VectorModel")
TVectorRegressionModel = typing.TypeVar("TVectorRegressionModel", bound="VectorRegressionModel")

32 

33 

class VectorModelBase(ABC, ToStringMixin):
    """
    Abstract root of all vector models, declaring the fundamental prediction interface.
    A vector model receives data frames as input in which every row is a vector of information.
    """
    def __init__(self):
        # the user-assigned name; as long as it is None, :meth:`get_name` generates a placeholder
        self._name = None

    @abstractmethod
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data frame
        :return: the data frame of predictions
        """

    @abstractmethod
    def is_regression_model(self) -> bool:
        """
        :return: whether this model is a regression model
        """

    @abstractmethod
    def get_predicted_variable_names(self) -> list:
        """
        :return: the list of names of the predicted variables
        """

    def with_name(self: TVectorModelBase, name: str) -> TVectorModelBase:
        """
        Assigns a name to the model (fluent variant of :meth:`set_name`).

        :param name: the name
        :return: self
        """
        self.set_name(name)
        return self

    def set_name(self, name):
        """
        :param name: the name to assign to the model
        """
        self._name = name

    def get_name(self):
        """
        :return: the name that was assigned to the model or, if none was assigned, a generated placeholder
            composed of the class name and the object's id in hexadecimal notation
        """
        assigned_name = self._name
        if assigned_name is not None:
            return assigned_name
        return f"unnamed-{self.__class__.__name__}-{id(self):x}"

71 

72 

class VectorModelFittableBase(VectorModelBase, ABC):
    """
    Extends the fundamental prediction interface of vector models with the fitting interface.
    A vector model receives data frames as input in which every row is a vector of information.
    """
    @abstractmethod
    def fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        :param x: the data frame of inputs
        :param y: the data frame of outputs
        """

    @abstractmethod
    def is_fitted(self) -> bool:
        """
        :return: whether the model has been fitted
        """

85 

86 

class TrainingContext:
    """
    Holds the context information that belongs to an ongoing training process
    """
    def __init__(self, original_input: pd.DataFrame, original_output: pd.DataFrame):
        """
        :param original_input: the input data frame with which the training process was started
        :param original_output: the output data frame with which the training process was started
        """
        self.original_input, self.original_output = original_input, original_output

94 

95 

class VectorModel(VectorModelFittableBase, PickleLoadSaveMixin, ABC):
    """
    Represents a model which uses data frames as inputs and outputs whose rows define individual data points.
    Every data frame row represents a vector of information (one-dimensional array), hence the name of the model.
    Note that the vectors in question are not necessarily vectors in the mathematical sense, as the information in each cell is not
    required to be numeric or uniform but can be arbitrarily complex.

    The input of the underlying model is computed from the raw input in up to three steps (see :meth:`_compute_model_inputs`):
    raw input transformation, feature generation and feature transformation.
    """
    TOSTRING_INCLUDE_PREPROCESSORS = True
    # members which are excluded from the persisted state and reset to None upon restoration
    _TRANSIENT_MEMBERS = ["_trainingContext"]
    # mapping from legacy member names to current member names (applied when restoring persisted state)
    _RENAMED_MEMBERS = {
        "checkInputColumns": "_checkInputColumns",
        "_inputTransformerChain": "_featureTransformerChain"
    }

    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: whether to check if the input column list (that is fed to the underlying model, i.e. after feature
            generation) during inference coincides with the input column list that was observed during training.
            This should be disabled if feature generation is not performed by the model itself, e.g. in meta-models
            such as ensemble models.
        """
        super().__init__()
        self._featureGenerator: Optional[FeatureGenerator] = None
        self._rawInputTransformerChain = DataFrameTransformerChain()
        self._featureTransformerChain = DataFrameTransformerChain()
        self._isFitted = False  # Note: this keeps track only of the actual model being fitted, not the pre/postprocessors
        self._predictedVariableNames: Optional[list] = None
        self._modelInputVariableNames: Optional[list] = None
        self._checkInputColumns = check_input_columns

        # transient members
        self._trainingContext: Optional[TrainingContext] = None

    def __getstate__(self):
        return getstate(VectorModel, self, transient_properties=self._TRANSIENT_MEMBERS)

    def __setstate__(self, state):
        # transient members are not part of the persisted state and are therefore explicitly reset
        for m in VectorModel._TRANSIENT_MEMBERS:
            state[m] = None
        setstate(VectorModel, self, state, renamed_properties=self._RENAMED_MEMBERS,
            new_default_properties={"_rawInputTransformerChain": DataFrameTransformerChain()})

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_exclude_exceptions(self) -> List[str]:
        e = super()._tostring_exclude_exceptions()
        if self.TOSTRING_INCLUDE_PREPROCESSORS:
            e += ["_rawInputTransformerChain", "_featureGenerator", "_featureTransformerChain"]
        return e

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        d = super()._tostring_additional_entries()
        if self._featureGenerator is not None:
            d["featureGeneratorNames"] = self._featureGenerator.get_names()
        if self._name is not None:
            d["name"] = self._name
        return d

    def with_raw_input_transformers(self: TVectorModel,
            *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
        """
        Makes the model use the given transformers (removing previously set raw input transformers, if any), which
        are to be applied to the raw input data frame (prior to feature generation).

        :param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of inputs
        :return: self
        """
        self._rawInputTransformerChain = DataFrameTransformerChain(*transformers)
        return self

    def with_feature_transformers(self: TVectorModel, *transformers: Union[DataFrameTransformer, List[DataFrameTransformer]],
            add=False) -> TVectorModel:
        """
        Makes the model use the given transformers
        which are to be applied to the data frames generated by feature generators.
        (If the model does not use feature generators, the transformers will be applied to
        whatever is produced by the raw input transformers or, if there are none, the original raw
        input data frame).

        :param transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of features
        :param add: whether to add the transformers to the existing transformers rather than replacing them
        :return: self
        """
        if not add:
            self._featureTransformerChain = DataFrameTransformerChain(*transformers)
        else:
            for t in transformers:
                self._featureTransformerChain.append(t)
        return self

    @deprecated("Use with_feature_transformers instead; this method will be removed in a future sensAI release.")
    def with_input_transformers(self: TVectorModel,
            *input_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel:
        """
        Makes the model use the given feature transformers (removing previously set transformers, if any),
        i.e. it transforms the data frame that is generated by the feature generators (if any).

        :param input_transformers: :class:`DataFrameTransformer` instances to use (in sequence) for the transformation of inputs
        :return: self
        """
        return self.with_feature_transformers(*input_transformers)

    def with_feature_generator(self: TVectorModel, feature_generator: Optional[FeatureGenerator]) -> TVectorModel:
        """
        Makes the model use the given feature generator in order to obtain the model inputs.
        If the model shall use more than one feature generator, pass a :class:`MultiFeatureGenerator` which combines them or
        use the perhaps more convenient :class:`FeatureCollector` in conjunction with :meth:`with_feature_collector`.

        Note: Feature generation takes place after raw input transformation and before feature transformation.

        :param feature_generator: the feature generator to use for input computation (None to use no feature generator)
        :return: self
        """
        self._featureGenerator = feature_generator
        return self

    def with_feature_collector(self: TVectorModel, feature_collector: FeatureCollector,
            shared: bool = False) -> TVectorModel:
        """
        Makes the model use a multi-feature generator obtained from the given collector
        in order to compute the underlying model's input from the data frame that is given.
        Overrides any feature generator previously passed to :meth:`with_feature_generator` (if any).

        Note: Feature generation takes place before feature transformation.

        :param feature_collector: the feature collector from which to obtain the multi-feature generator
        :param shared: whether the given feature collector is shared between models (i.e. whether
            the same instance is passed to multiple models).
            Passing `shared=True` ensures that models using the same collector do not end up
            using the same multi-feature generator instance and instead receive an independent instance.
        :return: self
        """
        if shared:
            self._featureGenerator = feature_collector.create_multi_feature_generator()
        else:
            self._featureGenerator = feature_collector.get_multi_feature_generator()
        return self

    def _pre_processors_are_fitted(self):
        """
        :return: True if both transformer chains and the feature generator (if any) are fitted
        """
        result = self._rawInputTransformerChain.is_fitted() and self._featureTransformerChain.is_fitted()
        if self.get_feature_generator() is not None:
            result = result and self.get_feature_generator().is_fitted()
        return result

    def is_fitted(self):
        """
        :return: True if the model has been fitted (i.e. the underlying model as well as all preprocessors), False otherwise
        """
        if not self._is_underlying_model_fitted():
            return False
        if not self._pre_processors_are_fitted():
            return False
        return True

    def _is_underlying_model_fitted(self):
        """
        :return: True if the underlying model is fitted or does not require fitting
        """
        underlying_model_is_fitted = not self._underlying_model_requires_fitting() or self._isFitted
        return underlying_model_is_fitted

    def _check_model_input_columns(self, model_input: pd.DataFrame):
        """
        Raises an exception if the column check is enabled and the columns of the given data frame differ from the
        columns that were recorded when the model was fitted.

        :param model_input: the data frame that is to be passed to the underlying model
        """
        if self._checkInputColumns and list(model_input.columns) != self._modelInputVariableNames:
            raise Exception(f"Inadmissible input data frame: "
                            f"expected columns {self._modelInputVariableNames}, got {list(model_input.columns)}")

    def compute_model_inputs(self, x: pd.DataFrame):
        """
        Applies feature generators and input transformers (if any) to generate from an input data frame the input for the
        underlying model

        :param x: the input data frame, to which input preprocessing is to be applied
        :return: the input data frame that serves as input for the underlying model
        """
        return self._compute_model_inputs(x)

    def _compute_model_inputs(self, x: pd.DataFrame, y: pd.DataFrame = None, fit=False) -> pd.DataFrame:
        """
        Applies, in this order, the raw input transformers, the feature generator (if any) and the feature transformers.

        :param x: the input data frame
        :param y: the output data frame (when training); only has to be provided if ``fit=True`` and preprocessors require outputs
            for fitting
        :param fit: if True, preprocessors will be fitted before being applied to ``x``
        :return: the data frame that serves as input for the underlying model
        """
        if fit:
            x = self._rawInputTransformerChain.fit_apply(x)
            if self._featureGenerator is not None:
                x = self._featureGenerator.fit_generate(x, y, self)
            x = self._featureTransformerChain.fit_apply(x)
        else:
            x = self._rawInputTransformerChain.apply(x)
            if self._featureGenerator is not None:
                x = self._featureGenerator.generate(x, self)
            x = self._featureTransformerChain.apply(x)
        return x

    def _compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        """
        Hook for subclasses which transform the outputs before they are passed to the underlying model;
        the default implementation returns the outputs unchanged.

        :param y: the output data frame
        :return: the data frame of outputs for the underlying model
        """
        return y

    def compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        """
        :param y: the output data frame
        :return: the data frame of outputs as it would be passed to the underlying model
        """
        return self._compute_model_outputs(y)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the model to the given input data frame

        :param x: the input data frame
        :return: the model outputs in the form of a data frame whose index corresponds to the index of the
            preprocessed input (which, unless preprocessing filters rows, is the index of ``x``)
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model {self} "
                            f"(isUnderlyingModelFitted={self._is_underlying_model_fitted()}, "
                            f"preProcessorsAreFitted={self._pre_processors_are_fitted()})")
        x = self._compute_model_inputs(x)
        self._check_model_input_columns(x)
        y = self._predict(x)
        return self._create_output_data_frame(y, x.index)

    def _create_output_data_frame(self, y: Union[pd.DataFrame, list], index):
        """
        :param y: the result of :meth:`_predict`
        :param index: the index to apply to the output data frame
        :return: the output data frame with the given index
        """
        if isinstance(y, pd.DataFrame):
            # make sure the data frame has the right index
            y.index = index
            return y
        else:
            predicted_columns = self.get_predicted_variable_names()
            if len(predicted_columns) != 1:
                raise ValueError(f"_predict must return a DataFrame as there are multiple predicted columns; got {type(y)}")
            return pd.DataFrame(pd.Series(y, name=predicted_columns[0], index=index))

    @abstractmethod
    def _predict(self, x: pd.DataFrame) -> Union[pd.DataFrame, list]:
        """
        :param x: the input data frame
        :return: the output data frame, or, for the case where a single column is to be predicted, the list of values for that column
        """
        pass

    def _underlying_model_requires_fitting(self) -> bool:
        """
        Designed to be overridden for rule-based models.

        :return: True iff the underlying model requires fitting
        """
        return True

    def _fit_preprocessors(self, x: pd.DataFrame, y: pd.DataFrame = None):
        """
        Fits all preprocessors without computing the final model inputs. Transformations are applied only
        where a downstream preprocessor requires the transformed data for its own fitting.

        :param x: the raw input data frame
        :param y: the output data frame (may be None)
        """
        # Downstream preprocessors must be fitted on the data they will actually receive at inference time,
        # i.e. on the raw-transformed input (consistent with _compute_model_inputs).
        # The application is skipped only if nothing is fitted downstream.
        has_downstream_preprocessors = self._featureGenerator is not None or len(self._featureTransformerChain) > 0
        if has_downstream_preprocessors:
            x = self._rawInputTransformerChain.fit_apply(x)
        else:
            self._rawInputTransformerChain.fit(x)
        # no need for fit_generate if chain is empty
        if self._featureGenerator is not None:
            if len(self._featureTransformerChain) == 0:
                self._featureGenerator.fit(x, y)
            else:
                x = self._featureGenerator.fit_generate(x, y, self)
        self._featureTransformerChain.fit(x)

    def fit_input_output_data(self, io_data: InputOutputData, fit_preprocessors=True, fit_model=True):
        """
        Fits the model using the given data

        :param io_data: the input/output data
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        """
        self.fit(io_data.inputs, io_data.outputs, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame], fit_preprocessors=True, fit_model=True):
        """
        Fits the model using the given data

        :param x: a data frame containing input data
        :param y: a data frame containing output data; may be None if the underlying model does not actually require
            fitting, e.g. in the case of a rule-based models, but fitting is still necessary for preprocessors
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises Exception: if the underlying model requires fitting but ``y`` is None
        :raises ValueError: if the lengths of inputs and outputs do not match
        """
        self._trainingContext = TrainingContext(x, y)
        try:
            log.info(f"Fitting {self.__class__.__name__} instance")
            sw = StopWatch()
            # y may legitimately be None (see above), in which case previously set names are retained
            if y is not None:
                self._predictedVariableNames = list(y.columns)
            if not self._underlying_model_requires_fitting():
                if fit_preprocessors:
                    self._fit_preprocessors(x, y=y)
                self._modelInputVariableNames = None  # not known for rule-based models because the fitting process is optimised
            else:
                if y is None:
                    raise Exception("The underlying model requires a data frame for fitting but Y=None was passed")
                if len(x) != len(y):
                    raise ValueError(f"Length of input ({len(x)}) does not match length of output ({len(y)})")
                y = self._compute_model_outputs(y)
                # NOTE(review): the y passed to the preprocessors here is the result of _compute_model_outputs,
                #   i.e. subclasses which transform outputs will have preprocessors fitted on the transformed outputs
                #   - confirm that this is intended
                x = self._compute_model_inputs(x, y=y, fit=fit_preprocessors)
                if len(x) != len(y):
                    log.debug(f"Input computation changed number of data points ({len(self._trainingContext.original_input)} -> {len(x)})")
                    # recover the outputs that belong to the remaining inputs via the index
                    y = y.loc[x.index]
                    if len(x) != len(y):
                        raise ValueError("Could not recover matching outputs for changed inputs. Only input filtering is admissible, "
                            "indices of input & ouput data frames must match.")
                self._modelInputVariableNames = list(x.columns)
                if fit_model:
                    inputs_with_types = ', '.join([n + '/' + x[n].dtype.name for n in self._modelInputVariableNames])
                    log.debug(f"Fitting with outputs[{len(y.columns)}]={list(y.columns)}, "
                        f"inputs[{len(self._modelInputVariableNames)}]=[{inputs_with_types}]; N={len(x)} data points")
                    self._fit(x, y)
                    self._isFitted = True
                else:
                    log.info("Fitting of underlying model skipped")
            log.info(f"Fitting completed in {sw.get_elapsed_time_secs():.2f} seconds: {self}")
        finally:
            # the training context must be reset even if fitting fails, as is_being_fitted depends on it
            self._trainingContext = None

    def is_being_fitted(self) -> bool:
        """
        :return: True if the model is currently in the process of being fitted, False otherwise
        """
        return self._trainingContext is not None

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Fits the underlying model.

        :param x: the data frame of model inputs (after preprocessing)
        :param y: the data frame of model outputs (as returned by :meth:`_compute_model_outputs`)
        """
        pass

    def get_predicted_variable_names(self):
        """
        :return: the list of variable names that are ultimately output by this model (i.e. the columns of the data frame output
            by :meth:`predict`)
        """
        return self._predictedVariableNames

    def get_model_input_variable_names(self) -> Optional[List[str]]:
        """
        :return: the list of variable names required by the underlying model as input (after feature generation and data frame
            transformation) or None if the model has not been fitted (or is a rule-based model which does not determine the variable names).
        """
        return self._modelInputVariableNames

    @deprecated("Use get_feature_transformer instead, this method will be removed in a future release")
    def get_input_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) feature transformer of the given type (if any) within this models feature transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching feature transformer or None
        """
        return self.get_feature_transformer(cls)

    def get_feature_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) feature transformer of the given type (if any) within this models feature transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching feature transformer or None
        """
        for it in self._featureTransformerChain.dataFrameTransformers:
            if isinstance(it, cls):
                return it
        return None

    def get_raw_input_transformer(self, cls: Type[DataFrameTransformer]):
        """
        Gets the (first) raw input transformer of the given type (if any) within this models raw input transformer chain

        :param cls: the type of transformer to look for
        :return: the first matching raw input transformer or None
        """
        for it in self._rawInputTransformerChain.dataFrameTransformers:
            if isinstance(it, cls):
                return it
        return None

    @deprecated("Use get_feature_transformer_chain instead, this method will be removed in a future release")
    def get_input_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's feature transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied after feature generation
        """
        return self.get_feature_transformer_chain()

    def get_raw_input_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's raw input transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied before feature generation
        """
        return self._rawInputTransformerChain

    def get_feature_transformer_chain(self) -> DataFrameTransformerChain:
        """
        :return: the model's feature transformer chain (which may be empty and contain no actual transformers),
            i.e. the transformers that are applied after feature generation
        """
        return self._featureTransformerChain

    def set_feature_generator(self, feature_generator: Optional[FeatureGenerator]):
        """
        :param feature_generator: the feature generator to use for input computation (None to use no feature generator)
        """
        self.with_feature_generator(feature_generator)

    def get_feature_generator(self) -> Optional[FeatureGenerator]:
        """
        :return: the model's feature generator (if any)
        """
        return self._featureGenerator

    def remove_input_preprocessors(self):
        """
        Removes all input preprocessors (i.e. raw input transformers, feature generators and feature transformers) from the model
        """
        self.with_raw_input_transformers()
        self.with_feature_generator(None)
        self.with_feature_transformers()

503 

504 

class VectorRegressionModel(VectorModel, ABC):
    """
    Base class for vector models that perform regression, adding support for an invertible target transformer
    (applied to the targets during training and inverted during inference) and for output transformers
    (post-processing of predictions).
    """
    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: Whether to check if the input column list (after feature generation)
            during inference coincides with the input column list during fit.
            This should be disabled if feature generation is not performed by the model itself,
            e.g. in ensemble models.
        """
        super().__init__(check_input_columns=check_input_columns)
        self._outputTransformerChain = DataFrameTransformerChain()
        self._modelOutputVariableNames: Optional[list] = None
        self._targetTransformer: Optional[InvertibleDataFrameTransformer] = None

    def _tostring_exclude_exceptions(self) -> List[str]:
        e = super()._tostring_exclude_exceptions()
        if self.TOSTRING_INCLUDE_PREPROCESSORS:
            e += ["_targetTransformer"]
        return e

    def is_regression_model(self) -> bool:
        return True

    def with_output_transformers(self: TVectorRegressionModel,
            *output_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorRegressionModel:
        """
        Makes the model use the given output transformers. Call with empty input to remove existing output transformers.
        The transformers are ignored during the fit phase. Not supported for rule-based models.

        **Important**: The output columns names of the last output transformer should be the same
        as the first one's input column names. If this fails to hold, an exception will be raised when :meth:`predict` is called.

        **Note**: Output transformers perform post-processing after the actual predictions have been made. Contrary
        to invertible target transformers, they are not invoked during the fit phase. Therefore, any losses computed there,
        including the losses on validation sets (e.g. for early stopping), will be computed on the non-post-processed data.
        A possible use case for such post-processing is if you know how improve the predictions of your fittable model
        by some heuristics or by hand-crafted rules.

        **How not to use**: Output transformers are not meant to transform the predictions into something with a
        different semantic meaning (e.g. normalized into non-normalized or something like that) - you should consider
        using a targetTransformer for this purpose. Instead, they give the possibility to improve predictions through
        post processing, when this is desired.

        :param output_transformers: DataFrameTransformers for the transformation of outputs
            (after the model has been applied)
        :return: self
        """
        # There is no reason for post-processing in rule-based models
        if not self._underlying_model_requires_fitting():
            raise Exception(f"Output transformers are not supported for model of type {self.__class__.__name__}")
        self._outputTransformerChain = DataFrameTransformerChain(*output_transformers)
        return self

    def with_target_transformer(self: TVectorRegressionModel,
            target_transformer: Optional[InvertibleDataFrameTransformer]) -> TVectorRegressionModel:
        """
        Makes the model use the given target transformers such that the underlying low-level model is trained on the transformed
        targets, but this high-level model still outputs the original (untransformed) values, i.e. the transformation is applied
        to targets during training and the inverse transformation is applied to the underlying model's predictions during inference.
        Hence the requirement of the transformer being invertible.

        This method is not supported for rule-based models, because they are not trained and therefore the transformation
        would serve no purpose.

        NOTE: All feature generators and data frame transformers - should they make use of outputs - will be fit on the untransformed
        target. The targetTransformer only affects the fitting of the underlying model.

        :param target_transformer: a transformer which transforms the targets (training data outputs) prior to learning the model, such
            that the model learns to predict the transformed outputs
        :return: self
        """
        # Disabled for rule-based models which do not apply fitting and therefore cannot make use of transformed targets
        if not self._underlying_model_requires_fitting():
            raise Exception(f"Target transformers are not supported for model of type {self.__class__.__name__}")
        self._targetTransformer = target_transformer
        return self

    def get_target_transformer(self):
        """
        :return: the target transformer (if any)
        """
        return self._targetTransformer

    def get_output_transformer_chain(self):
        """
        :return: the chain of output transformers (which may be empty)
        """
        return self._outputTransformerChain

    def _apply_post_processing(self, y: pd.DataFrame):
        """
        Applies the inverse target transformation (if a target transformer is set) followed by the output transformers
        and verifies that the resulting columns are the predicted variable names.

        :param y: the data frame of predictions of the underlying model
        :return: the post-processed data frame
        """
        if self._targetTransformer is not None:
            y = self._targetTransformer.apply_inverse(y)
        y = self._outputTransformerChain.apply(y)

        if list(y.columns) != self.get_predicted_variable_names():
            raise Exception(
                f"The model's predicted variable names are not correct. Got "
                f"{list(y.columns)} but expected {self.get_predicted_variable_names()}. "
                f"This kind of error can happen if the model's outputTransformerChain changes a data frame's "
                f"columns (e.g. renames them or changes order). Only output transformer chains that do not change "
                f"columns are permitted in VectorModel. You can fix this by modifying this instance's outputTransformerChain, "
                f"e.g. by calling .with_output_transformers() with the correct input "
                f"(which can be empty to remove existing output transformers)"
            )
        return y

    def _compute_model_outputs(self, y: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the target transformer (if any) to the given outputs. While the model is being fitted, the transformer
        is fitted on the outputs and the resulting column names are recorded as the model output variable names.

        :param y: the output data frame
        :return: the (transformed) output data frame for the underlying model
        """
        target_transformer = self._targetTransformer
        if target_transformer is not None:
            # This method is also reachable via the public compute_model_outputs outside of a fitting process.
            # A transformer which is already fitted must then not be refitted on the given data, as this would
            # silently change the inverse transformation that is applied during inference.
            if self.is_being_fitted() or not target_transformer.is_fitted():
                y = target_transformer.fit_apply(y)
            else:
                y = target_transformer.apply(y)
        if self.is_being_fitted():
            self._modelOutputVariableNames = list(y.columns)
        return y

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the model to the given input data frame, including post-processing of the underlying model's predictions

        :param x: the input data frame
        :return: the data frame of (post-processed) predictions
        """
        y = super().predict(x)
        return self._apply_post_processing(y)

    def is_fitted(self):
        """
        :return: True if the model, its preprocessors, its target transformer (if any) and its output transformers are fitted
        """
        if not super().is_fitted():
            return False
        if self._targetTransformer is not None and not self._targetTransformer.is_fitted():
            return False
        if not self._outputTransformerChain.is_fitted():
            return False
        return True

    def get_model_output_variable_names(self):
        """
        Gets the list of variable names predicted by the underlying model.
        For the case where at training time the ground truth is transformed by a target transformer
        which changes column names, the names of the variables after the transformation will be returned.
        Thus this method always returns the variable names that are actually predicted by the underlying model alone.
        For the variable names that are ultimately output by the entire VectorModel instance when calling predict,
        use :meth:`get_predicted_variable_names`.
        """
        return self._modelOutputVariableNames

634 

635 

class VectorClassificationModel(VectorModel, ABC):
    """
    Base class for vector models that perform classification, i.e. which predict a single column of class labels
    and may additionally be able to predict class probabilities.
    """
    def __init__(self, check_input_columns=True):
        """
        :param check_input_columns: Whether to check if the input column list (after feature generation)
            during inference coincides with the input column list during fit.
            This should be disabled if feature generation is not performed by the model itself,
            e.g. in ensemble models.
        """
        super().__init__(check_input_columns=check_input_columns)
        # the sorted list of class labels (set when the model is fitted)
        self._labels = None

    def is_regression_model(self) -> bool:
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Records the sorted list of class labels found in the single output column and then fits the classifier.

        :param x: the data frame of model inputs
        :param y: the data frame of outputs, which must contain exactly one column
        :raises ValueError: if ``y`` does not contain exactly one column
        """
        if len(y.columns) != 1:
            raise ValueError("Classification requires exactly one output column with class labels")
        self._labels = sorted(y.iloc[:, 0].unique())
        self._fit_classifier(x, y)

    def get_class_labels(self) -> List[Any]:
        """
        :return: the sorted list of class labels
        """
        return self._labels

    @abstractmethod
    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Fits the underlying classifier.

        :param x: the data frame of model inputs
        :param y: the data frame containing the single column of class labels
        """
        pass

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a data frame as returned by :meth:`predict_class_probabilities` to a result as returned by :meth:`predict`,
        selecting for each row the label (column) with the highest probability.

        :param df: the output data frame from :meth:`predict_class_probabilities`
        :return: an output data frame as it would be returned by predict (with the same index as ``df``)
        :raises ValueError: if the columns of ``df`` are not the model's class labels
        """
        labels = self.get_class_labels()
        df_cols = list(df.columns)
        if sorted(df_cols) != labels:
            raise ValueError(f"Expected data frame with columns {labels}, got {df_cols}")
        y_array = df.values
        max_indices = np.argmax(y_array, axis=1)
        result = [df_cols[i] for i in max_indices]
        # retain the index such that each prediction remains associated with its row of probabilities
        return pd.DataFrame(result, columns=self.get_predicted_variable_names(), index=df.index)

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame where the list of columns is the list of class labels and the values are probabilities, with the same
            index as the (preprocessed) input data frame.
            Raises an exception if the classifier cannot predict probabilities.
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        x = self._compute_model_inputs(x)
        result = self._predict_class_probabilities(x)
        result.index = x.index
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the column names are correctly set, sorted and whether the entries correspond to probabilities.
        Incorrect columns result in an exception, whereas implausible probabilities merely result in logged warnings.

        :param prediction_df: the data frame of class probabilities to check
        :param max_rows_to_check: the maximum number of (leading) rows whose values are checked
        """
        labels = self.get_class_labels()
        if list(prediction_df.columns) != labels:
            raise Exception(f"{self} _predict_class_probabilities returned DataFrame with incorrect columns: "
                            f"expected {labels}, got {prediction_df.columns}")

        df_to_check = prediction_df.iloc[:max_rows_to_check]
        for i, (_, valueSeries) in enumerate(df_to_check.iterrows(), start=1):

            if not all(0 <= valueSeries) or not all(valueSeries <= 1):
                log.warning(f"Probabilities data frame may not be correctly normalised, "
                            f"got probabilities outside the range [0, 1]: checked row {i}/{max_rows_to_check} contains {list(valueSeries)}")

            s = valueSeries.sum()
            if not np.isclose(s, 1, atol=1e-2):
                log.warning(f"Probabilities data frame may not be correctly normalised: "
                            f"checked row {i}/{max_rows_to_check} contains {list(valueSeries)}")

    @abstractmethod
    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        If you are implementing a probabilistic classifier, this method has to return a data frame with probabilities
        (one column per label). The default implementation of _predict will then use the output of
        this method and convert it to predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you may override _predict instead of implementing this method. In the case of a
        non-probabilistic classifier, the implementation of this method should raise an exception.
        """
        raise NotImplementedError(f"{self.__class__.__name__} does not implement _predict_class_probabilities.")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts labels by selecting, for each row, the label with the highest predicted probability.

        :param x: the data frame of model inputs
        :return: the data frame of predicted labels
        """
        predicted_probabilities_df = self._predict_class_probabilities(x)
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

732 

733 

class RuleBasedVectorRegressionModel(VectorRegressionModel, ABC):
    """
    Base class for regression models whose underlying model is rule-based and thus does not require fitting
    """
    def __init__(self, predicted_variable_names: list):
        """
        :param predicted_variable_names: These are typically known at init time for rule-based models
        """
        super().__init__(check_input_columns=False)
        # the model output variable names are guaranteed to be the same as the predicted variable names,
        # since target transformers and output transformers are disallowed
        self._predictedVariableNames = self._modelOutputVariableNames = predicted_variable_names

    def _underlying_model_requires_fitting(self):
        """
        :return: False, as there is no underlying model to be fitted
        """
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Does nothing, as there is no underlying model to be fitted
        """
        return None

749 

750 

class RuleBasedVectorClassificationModel(VectorClassificationModel, ABC):
    """
    Base class for classification models whose underlying model is rule-based and thus does not require fitting
    """
    def __init__(self, labels: list, predicted_variable_name="predictedLabel"):
        """
        :param labels: the class labels, which must not contain duplicates; they are stored in sorted order
        :param predicted_variable_name: the name of the single predicted variable
        """
        super().__init__(check_input_columns=False)

        first_duplicate = get_first_duplicate(labels)
        if first_duplicate is not None:
            raise Exception(f"Found duplicate label: {first_duplicate}")
        self._predictedVariableNames = [predicted_variable_name]
        self._labels = sorted(labels)

    def _underlying_model_requires_fitting(self):
        """
        :return: False, as there is no underlying model to be fitted
        """
        return False

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Does nothing, as there is no underlying model to be fitted
        """
        return None

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        Does nothing, as there is no classifier to be fitted
        """
        return None