Coverage for src/sensai/tensor_model.py: 32%

179 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-29 18:29 +0000

1""" 

2This module contains base classes for models that input and output tensors, for example CNNs. 

3The fitting and predictions will still be performed on data frames, like in VectorModel, 

4but now it will be expected that all entries of the input data frame passed to the model are tensors of the same shape. 

5Lists of scalars of the same lengths are also accepted. The same is expected of the ground truth data frames. 

6Everything will work as well if the entries are just scalars but in this case it is recommended to use 

7VectorModel instead. 

8 

9If we denote the shapes of entries in the dfs as inputTensorShape and outputTensorShape, 

10the model will be fit on input tensors of shape (N_rows, N_inputColumns, inputTensorShape) and output tensors of 

11shape (N_rows, N_outputColumns, outputTensorShape), where empty dimensions (e.g. for one-column data frames) 

12will be stripped. 

13""" 

14 

15 

16import logging 

17from abc import ABC, abstractmethod 

18from typing import Optional, Tuple 

19 

20import numpy as np 

21import pandas as pd 

22 

23from .util.pandas import extract_array 

24from .vector_model import VectorRegressionModel, VectorClassificationModel, VectorModel 

25 

# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# we set the default level to debug because it is often interesting for the user to receive
# debug information about shapes as data frames get converted to arrays
# NOTE(review): this changes the logger's level as a side effect of importing the module
log.setLevel(logging.DEBUG)

30 

31 

class InvalidShapeError(Exception):
    """Raised in this module when a data point or a predictions array does not have the expected shape."""

34 

35 

def _get_datapoint_shape(df: pd.DataFrame):
    """
    Determines the shape of a single data point by converting only the first row of the given data frame to an array.

    :param df: the data frame whose first row is inspected
    :return: the shape of the array extracted from the first row
    """
    head = df.iloc[:1]
    # Note (taken over from the original author): the empty first dimension with N_Datapoints=1
    # is stripped by extract_array, so the resulting shape is that of one data point
    single_datapoint_array = extract_array(head)
    return single_datapoint_array.shape

40 

41 

def _check_df_shape(df: pd.DataFrame, desired_shape: tuple):
    """
    Verifies that the data points of the given data frame (judged by its first row) have the desired shape.

    :param df: the data frame to check
    :param desired_shape: the shape every data point is expected to have
    :raises InvalidShapeError: if the data point shape differs from `desired_shape`
    """
    actual_shape = _get_datapoint_shape(df)
    if actual_shape == desired_shape:
        return
    raise InvalidShapeError(f"Wrong input shape for data point. Expected {desired_shape} but got {actual_shape}")

46 

47 

48# This is implemented as a mixin because there can be no functional common class for all tensor models. 

49# The reason is that actual implementations need to inherit from Vector-Regression/Classification-Model 

50# (or duplicate a lot of code) and thus it is not possible to inherit from something like TensorModel(VectorModel) 

51# without getting into a mess. 

class TensorModel(ABC):
    """
    Mixin with the array-based fitting/prediction helpers shared by the tensor models in this module.

    Implementations provide `_fit_to_array` and `_predict_array`; the helpers defined here convert between
    data frames and arrays and remember the shapes of a single input/ground truth data point seen during fitting.
    """

    def __init__(self):
        # shapes of a single input data point and a single ground truth data point;
        # both remain None until _fit_tensor_model has been called
        self._modelInputShape = None
        self._modelOutputShape = None

    @abstractmethod
    def _fit_to_array(self, x: np.ndarray, y: np.ndarray):
        """
        Fits the model on the stacked input array and the stacked ground truth array.

        :param x: the array extracted from the input data frame
        :param y: the array extracted from the ground truth data frame
        """
        pass

    @abstractmethod
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        """
        The result should be of shape `(N_DataPoints, *predictedTensorShape)` if a single column is predicted
        or of shape `(N_DataPoints, N_Columns, *predictedTensorShape)` if multiple columns are predicted
        (e.g. for multiple regression targets). Note that in both cases, the number of predicted columns
        should coincide with the corresponding number in the ground truth data frame the model was fitted on.

        :param x: a tensor of shape `(N_DataPoints, *inputTensorShape)`
        :return: the array of predictions
        """
        pass

    def _predict_df_through_array(self, x: pd.DataFrame, output_columns: list) -> pd.DataFrame:
        """
        To be used within _predict in implementations of this class. Performs predictions by
        transforming x into an array, computing the predicted array from it and turning the result into a
        predictions data frame.

        :param x: input data frame (of the same type as for _fit_tensor_model)
        :param output_columns: columns of the output data frame, typically the predicted variable names
            of the implementation
        :return: a data frame with the index of `x` and one column per entry of `output_columns`
        :raises InvalidShapeError: if the number of predictions differs from the number of rows of `x`
            or if the predictions array does not provide one slice per output column
        """
        y = self._predict_array(extract_array(x))
        if len(y) != len(x):
            raise InvalidShapeError(f"Number of data points (lengths) of input data frame and predictions must agree. "
                                    f"Expected {len(x)} but got {len(y)}")

        result = pd.DataFrame(index=x.index)
        n_columns = len(output_columns)
        if n_columns == 1:
            # each entry of the single column is the full prediction for the respective data point
            result[output_columns[0]] = list(y)
            return result

        # With several columns, the second axis of the predictions must enumerate the columns.
        # The rank is checked first so that a 1-dimensional array yields a meaningful error instead of an IndexError.
        if len(y.shape) < 2 or y.shape[1] != n_columns:
            raise InvalidShapeError(f"Wrong shape of predictions array for a data frame with {n_columns} columns ({output_columns}). "
                                    f"Expected shape ({len(x)}, {n_columns}, ...) but got: {y.shape}")
        for i, col in enumerate(output_columns):
            result[col] = list(y[:, i])
        return result

    def _fit_tensor_model(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        To be used within _fit in implementations of this class. Converts both data frames to arrays,
        stores the shapes of the first input and ground truth data points and delegates to `_fit_to_array`.

        :param x: the input data frame
        :param y: the ground truth data frame
        """
        log.debug(f"Stacking input tensors from columns {x.columns} and from all rows to a single array. "
                  f"Note that all tensors need to have the same shape")
        x = extract_array(x)
        y = extract_array(y)
        self._modelInputShape = x[0].shape
        self._modelOutputShape = y[0].shape
        log.debug(f"Fitting on {len(x)} datapoints of shape {self._modelInputShape}. "
                  f"The ground truth are tensors of shape {self._modelOutputShape}")
        self._fit_to_array(x, y)

    def get_model_input_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single input data point as seen during fitting, or None if the model was not fitted
        """
        return self._modelInputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point as seen during fitting, or None if the model was not fitted
        """
        # fixed: this previously returned the input shape
        return self._modelOutputShape

121 

122 

class TensorToScalarRegressionModel(VectorRegressionModel, TensorModel, ABC):
    """
    Base class for regression models that take tensors as input and output scalars. They can be evaluated
    in the same way as non-tensor regression models.
    """

    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        :param check_input_shape: Whether to check if during predict input tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by this model, which is reported via the inherited helper
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts for the given inputs, optionally validating the shape of the input data points beforehand.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        if self.check_input_shape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        predictions = super().predict(x)
        return predictions

149 

150 

class TensorToScalarClassificationModel(VectorClassificationModel, TensorModel, ABC):
    """
    Base class for classification models that take tensors as input and output scalars. They can be evaluated
    in the same way as non-tensor classification models.
    """

    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        :param check_input_shape: Whether to check if during predict input tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorClassificationModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by this model, which is reported via the inherited helper
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        label_columns = self.get_class_labels()
        return self._predict_df_through_array(x, label_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts for the given inputs, optionally validating the shape of the input data points beforehand.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        if self.checkInputShape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        predictions = super().predict(x)
        return predictions

    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        # merely forwards to the more specifically named abstract method that implementations provide
        probabilities = self._predict_probabilities_array(x)
        return probabilities

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, N_Labels)`. It is assumed that labels are lexicographically sorted and the order
        of predictions in the output array should respect this.

        The default implementation of _predict will then use the output of this method and convert it to
        predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

196 

197 

198# Note: for tensor to tensor models the output shape is not trivial. There will be dedicated evaluators 

199# and metrics for them. Examples for such models are auto-encoders, models performing semantic segmentation, 

200# models for super-resolution and so on 

class TensorToTensorRegressionModel(VectorRegressionModel, TensorModel, ABC):
    """
    Base class for regression models that output tensors. Multiple targets can be used by putting
    them into separate columns. In that case it is required that all target tensors have the same shape.
    """

    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape
        self.checkOutputShape = check_output_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by this model, which is reported via the inherited helper
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts for the given inputs, optionally validating the shapes of the input data points
        and of the resulting predictions.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        fitted = self.is_fitted()
        if not fitted:
            message = ("Calling predict with unfitted model. "
                       "This might lead to errors down the line, especially if input/output checks are enabled")
            raise Exception(message)
        if self.checkInputShape:
            expected_input_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_input_shape)
        predictions = super().predict(x)
        if self.checkOutputShape:
            expected_output_shape = self.get_model_output_shape()
            _check_df_shape(predictions, expected_output_shape)
        return predictions

237 

238 

class TensorToTensorClassificationModel(VectorModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for classification models that output tensors, e.g. for semantic segmentation. The models
        can be fit on a ground truth data frame with a single column. The entries in this column should be
        binary tensors with one-hot-encoded labels, i.e. of shape `(*predictionShape, numLabels)`

        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape
        self.check_output_shape = check_output_shape
        # size of the last (label) dimension of the ground truth; determined in fit
        self._numPredictedClasses: Optional[int] = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by this model, which is reported via the inherited helper
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def is_regression_model(self) -> bool:
        return False

    def get_num_predicted_classes(self):
        """
        :return: the number of labels (size of the last dimension of the ground truth), or None before fit was called
        """
        return self._numPredictedClasses

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None, fit_preprocessors=True, fit_model=True):
        """
        Validates the ground truth (based on its first rows) and then fits the model.

        :param x: data frame containing input tensors on which to train
        :param y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape
            `(*prediction_shape, numLabels)`
        :param weights: data point weights (must be None; not supported by this model!)
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises ValueError: if `y` does not have exactly one column
        :raises InvalidShapeError: if the ground truth tensors have no dimensions besides the label dimension
        """
        if len(y.columns) != 1:
            raise ValueError(f"{self.__class__.__name__} requires exactly one output "
                             f"column with tensors containing one-hot-encoded labels")

        # checking if Y is a binary array of one hot encoded labels (only the first rows are inspected)
        df_y_to_check = extract_array(y.iloc[:5])
        if not np.array_equal(df_y_to_check, df_y_to_check.astype(bool)):
            raise Exception(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                            f"of shape (*prediction_shape, numLabels). Did you forget to one-hot-encode your labels "
                            f"before training?")
        # df_y_to_check has shape (N_datapoints=5, *prediction_shape, N_labels)
        prediction_shape = df_y_to_check.shape[1:-1]
        if len(prediction_shape) == 0:
            raise InvalidShapeError(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                                    f"of shape (*prediction_shape, numLabels). However, received array of trivial "
                                    f"prediction_shape. If the predictions are scalars, a TensorToScalarClassificationModel "
                                    f"should be used instead of {self.__class__.__name__}")
        self._numPredictedClasses = df_y_to_check.shape[-1]
        # fixed: fit_model is now forwarded instead of always passing True
        super().fit(x, y, weights=weights, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def get_model_output_shape(self):
        """
        :return: the shape of a single predicted label tensor (the ground truth shape without the
            one-hot-encoded label dimension), or None if the model was not fitted
        """
        # The ground truth contains one-hot-encoded labels in the last dimension
        # The model output predicts the labels as ints, without one-hot-encoding
        # The ground truth shape stored during fitting is read directly, so that the result
        # does not depend on the getter of the base class.
        one_hot_encoded_output_shape = self._modelOutputShape
        if one_hot_encoded_output_shape is None:
            return None
        return one_hot_encoded_output_shape[:-1]

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a result returned by predict_class_probabilities to a result as returned by predict.

        :param df: the output data frame from predict_class_probabilities
        :return: an output data frame as it would be returned by predict
        """
        df = df.copy()
        col_name = self.get_predicted_variable_names()[0]
        # the label index is the position of the largest probability along the last (label) axis
        df[col_name] = df[col_name].apply(lambda probas_array: probas_array.argmax(axis=-1))
        return df

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame with a single column containing arrays of shape `(*tensorShape, numLabels)`.
            Raises an exception if the classifier cannot predict probabilities.
        """
        x = self._compute_model_inputs(x)
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        result = self._predict_class_probabilities(x)
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the shapes of the predicted probability arrays match the ground truth and whether the entries
        of the first rows correspond to probabilities. Violations of the latter are only logged as warnings.

        :param prediction_df: the data frame of predicted probability arrays
        :param max_rows_to_check: the number of leading rows whose values are inspected
        :raises InvalidShapeError: if output shape checking is enabled and the shapes do not match
        """
        if self.check_output_shape:
            # Probability arrays still carry the label dimension, so they have to be compared to the
            # one-hot-encoded ground truth shape rather than to the shape of the predicted labels.
            _check_df_shape(prediction_df, self._modelOutputShape)

        array_to_check = extract_array(prediction_df.iloc[:max_rows_to_check])

        if not np.all(0 <= array_to_check) or not np.all(array_to_check <= 1):
            log.warning(f"Probability arrays may not be correctly normalised, "
                        f"got probabilities outside the range [0, 1]")

        s = array_to_check.sum(axis=-1)
        if not np.all(np.isclose(s, 1)):
            log.warning(
                f"Probability array data frame may not be correctly normalised, "
                f"received probabilities do not sum to 1")

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        return self._predict_df_through_array(x, self.get_predicted_variable_names())

    # just renaming the abstract method to implement
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, *predictionShape, N_Labels)`, i.e. with the label dimension last. The order of
        the labels should be the same as in the one-hot-encoded ground truth.

        The default implementation of _predict will then use the output of this method and convert it to
        predicted labels (via argmax over the last dimension).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        try:
            predicted_probabilities_df = self._predict_class_probabilities(x)
        except Exception as e:
            # chain explicitly so that the original failure is reported as the cause
            raise Exception(f"Wrong implementation of {self.__class__.__name__}. For non-probabilistic classifiers "
                            "_predict has to be overrode!") from e
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

    # TODO or not TODO: I don't see how to reduce the code duplication here...
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Returns a data frame of integer arrays. If the model was fitted on binary ground truth arrays of
        shape `(*tensorShape, numLabels)`, predictions will have the shape `tensorShape` and contain integers
        0, 1, ..., numLabels - 1. They correspond to the predicted labels
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        y = super().predict(x)
        if self.check_output_shape:
            _check_df_shape(y, self.get_model_output_shape())
        return y