Coverage for src/sensai/tensor_model.py: 33%

175 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1""" 

2This module contains base classes for models that input and output tensors, for example CNNs. 

3The fitting and predictions will still be performed on data frames, like in VectorModel, 

4but now it will be expected that all entries of the input data frame passed to the model are tensors of the same shape. 

5Lists of scalars of the same lengths are also accepted. The same is expected of the ground truth data frames. 

6Everything will work as well if the entries are just scalars but in this case it is recommended to use 

7VectorModel instead. 

8 

9If we denote the shapes of entries in the dfs as inputTensorShape and outputTensorShape, 

10the model will be fit on input tensors of shape (N_rows, N_inputColumns, inputTensorShape) and output tensors of 

11shape (N_rows, N_outputColumns, outputTensorShape), where empty dimensions (e.g. for one-column data frames) 

12will be stripped. 

13""" 

14 

15 

16import logging 

17from abc import ABC, abstractmethod 

18from typing import Optional, Tuple 

19 

20import numpy as np 

21import pandas as pd 

22 

23from .util.pandas import extract_array 

24from .vector_model import VectorRegressionModel, VectorClassificationModel, VectorModel 

25 

# The module logger deliberately defaults to the debug level: the shape information that is
# logged while data frames are converted to arrays is often of interest to the user
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

30 

31 

class InvalidShapeError(Exception):
    """
    Raised when the shape of a data point or of a predictions array differs from the expected one.
    """

34 

35 

def _get_datapoint_shape(df: pd.DataFrame):
    """
    Determines the shape of a single data point by converting only the first row of the
    given data frame to an array.

    :param df: the data frame whose data point shape is to be determined
    :return: the shape of the array extracted from the first row
    """
    # Only one row is passed on, so the leading dimension (N_Datapoints=1) is empty;
    # note that such an empty first dimension is stripped by extract_array
    return extract_array(df.iloc[:1]).shape

40 

41 

def _check_df_shape(df: pd.DataFrame, desired_shape: tuple):
    """
    Verifies that the data points in the given data frame have the desired shape.
    Only the first row of the data frame is inspected.

    :param df: the data frame to check
    :param desired_shape: the shape that a single data point is expected to have
    :raises InvalidShapeError: if the shape of the first data point differs from `desired_shape`
    """
    datapoint_shape = _get_datapoint_shape(df)
    if datapoint_shape == desired_shape:
        return
    raise InvalidShapeError(f"Wrong input shape for data point. Expected {desired_shape} but got {datapoint_shape}")

46 

47 

48# This is implemented as a mixin because there can be no functional common class for all tensor models. 

49# The reason is that actual implementations need to inherit from Vector-Regression/Classification-Model 

50# (or duplicate a lot of code) and thus it is not possible to inherit from something like TensorModel(VectorModel) 

51# without getting into a mess. 

class TensorModel(ABC):
    """
    Mixin providing the array-based fitting and prediction logic shared by all tensor models.
    Implementations combine it with one of the vector model base classes and delegate to
    `_fit_tensor_model` and `_predict_df_through_array` from their `_fit`/`_predict` methods.
    """

    def __init__(self):
        # shapes of a single input/ground truth data point; set in _fit_tensor_model
        self._modelInputShape = None
        self._modelOutputShape = None

    @abstractmethod
    def _fit_to_array(self, x: np.ndarray, y: np.ndarray):
        """
        Fits the model on the arrays that were extracted from the input and ground truth data frames.

        :param x: the array extracted from the input data frame
        :param y: the array extracted from the ground truth data frame
        """
        pass

    @abstractmethod
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        """
        The result should be of shape `(N_DataPoints, *predictedTensorShape)` if a single column is predicted
        or of shape `(N_DataPoints, N_Columns, *predictedTensorShape)` if multiple columns are predicted
        (e.g. for multiple regression targets). Note that in both cases, the number of predicted columns
        should coincide with corresponding number in the ground truth data frame the model was fitted on

        :param x: a tensor of shape `(N_DataPoints, *inputTensorShape)`
        """
        pass

    def _predict_df_through_array(self, x: pd.DataFrame, output_columns: list) -> pd.DataFrame:
        """
        To be used within _predict in implementations of this class. Performs predictions by
        transforming x into an array, computing the predicted array from it and turning the result into a
        predictions data frame.

        :param x: input data frame (of same type as for `_fit_tensor_model`)
        :param output_columns: columns of the output data frame, typically the result of calling
            `get_predicted_variable_names()` in an implementation
        :return: a data frame with the index of `x` and one column per entry in `output_columns`
        :raises InvalidShapeError: if the number of predictions differs from the number of rows in `x`
            or if the predictions array does not provide one entry per output column
        """
        y = self._predict_array(extract_array(x))
        if len(y) != len(x):
            raise InvalidShapeError(f"Number of data points (lengths) of input data frame and predictions must agree. "
                                    f"Expected {len(x)} but got {len(y)}")

        result = pd.DataFrame(index=x.index)
        n_columns = len(output_columns)
        if n_columns == 1:
            # each entry of the single column holds the whole predicted tensor of one data point
            result[output_columns[0]] = list(y)
        else:
            # the second axis must enumerate the columns; a prediction without a second axis is
            # reported as a shape error rather than surfacing as an IndexError from y.shape[1]
            if y.ndim < 2 or n_columns != y.shape[1]:
                raise InvalidShapeError(f"Wrong shape of predictions array for a data frame with {n_columns} columns ({output_columns}). "
                                        f"Expected shape ({len(x)}, {n_columns}, ...) but got: {y.shape}")
            for i, col in enumerate(output_columns):
                result[col] = list(y[:, i])
        return result

    def _fit_tensor_model(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        To be used within _fit in implementations of this class. Converts both data frames to arrays,
        stores the shapes of the first input and ground truth data point and fits the model on the arrays.

        :param x: the input data frame
        :param y: the ground truth data frame
        """
        log.debug(f"Stacking input tensors from columns {x.columns} and from all rows to a single array. "
                  f"Note that all tensors need to have the same shape")
        x = extract_array(x)
        y = extract_array(y)
        self._modelInputShape = x[0].shape
        self._modelOutputShape = y[0].shape
        log.debug(f"Fitting on {len(x)} datapoints of shape {self._modelInputShape}. "
                  f"The ground truth are tensors of shape {self._modelOutputShape}")
        self._fit_to_array(x, y)

    def get_model_input_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single input data point as recorded during fitting, None before that
        """
        return self._modelInputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point as recorded during fitting, None before that
        """
        # this getter previously returned the input shape, which broke all output shape checks
        return self._modelOutputShape

121 

122 

class TensorToScalarRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for regression models mapping tensor inputs to scalar outputs. Evaluation works
        exactly as for regression models that do not operate on tensors.

        :param check_input_shape: Whether predict shall verify that the input tensors have the same shape as
            the ones seen during fit. This has to be disabled for certain applications, e.g. when applying
            CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        predicted_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, predicted_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the (optional) input shape check and then delegates to the predict method of the super-class.

        :param x: the input data frame
        :return: the data frame returned by the super-class's predict
        """
        if self.check_input_shape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

148 

149 

class TensorToScalarClassificationModel(VectorClassificationModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for classification models mapping tensor inputs to scalar outputs. Evaluation works
        exactly as for classification models that do not operate on tensors.

        :param check_input_shape: Whether predict shall verify that the input tensors have the same shape as
            the ones seen during fit. This has to be disabled for certain applications, e.g. when applying
            CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorClassificationModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        # one output column per class label
        class_labels = self.get_class_labels()
        return self._predict_df_through_array(x, class_labels)

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame):
        self._fit_tensor_model(x, y)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the (optional) input shape check and then delegates to the predict method of the super-class.

        :param x: the input data frame
        :return: the data frame returned by the super-class's predict
        """
        if self.checkInputShape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

    # The abstract method of TensorModel is merely given a more specific name for implementations
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        Implementations of probabilistic classifiers should return a tensor of probabilities of shape
        `(N_DataPoints, N_Labels)` here. Labels are assumed to be lexicographically sorted, and the order
        of the predictions in the returned array should respect this.

        The default implementation of _predict builds on the output of this method, converting it to
        predicted labels (via argmax).

        If only labels are to be predicted, or if labels can be predicted more efficiently than via argmax,
        _predict has to be overridden in the implementation. In the former case of a non-probabilistic
        classifier, this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

194 

195 

196# Note: for tensor to tensor models the output shape is not trivial. There will be dedicated evaluators 

197# and metrics for them. Examples for such models are auto-encoders, models performing semantic segmentation, 

198# models for super-resolution and so on 

class TensorToTensorRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for regression models whose outputs are tensors. Several targets are supported by placing
        them in separate columns, in which case all target tensors are required to have the same shape.

        :param check_input_shape: Whether predict shall verify that the input tensors have the same shape as
            the ones seen during fit. This has to be disabled for certain applications, e.g. when applying
            CNNs to inputs that are larger than the ones in the training set
        :param check_output_shape: Whether predict shall verify that the predictions have the same shape as
            the ground truth data seen during fit. This has to be disabled for certain applications, e.g. when
            applying CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape
        self.checkOutputShape = check_output_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        predicted_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, predicted_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Delegates to the predict method of the super-class, applying the enabled shape checks to the
        input before and to the predictions after the call.

        :param x: the input data frame
        :return: the data frame returned by the super-class's predict
        """
        # the shape checks rely on the shapes recorded during fitting, so fail early without them
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.checkInputShape:
            _check_df_shape(x, self.get_model_input_shape())
        predictions = super().predict(x)
        if self.checkOutputShape:
            _check_df_shape(predictions, self.get_model_output_shape())
        return predictions

234 

235 

class TensorToTensorClassificationModel(VectorModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for classification models that output tensors, e.g. for semantic segmentation. The models
        can be fit on a ground truth data frame with a single column. The entries in this column should be
        binary tensors with one-hot-encoded labels, i.e. of shape `(*predictionShape, numLabels)`

        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape
        self.check_output_shape = check_output_shape
        # size of the last (label) dimension of the ground truth; set in fit
        self._numPredictedClasses: Optional[int] = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        self._fit_tensor_model(x, y)

    def is_regression_model(self) -> bool:
        return False

    def get_num_predicted_classes(self):
        """
        :return: the size of the label dimension of the ground truth passed to fit, None before fit was called
        """
        return self._numPredictedClasses

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, fit_preprocessors=True, fit_model=True):
        """

        :param x: data frame containing input tensors on which to train
        :param y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape
            `(*prediction_shape, numLabels)`

        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises ValueError: if `y` does not have exactly one column
        :raises InvalidShapeError: if the ground truth tensors have a trivial prediction shape
        """
        if len(y.columns) != 1:
            raise ValueError(f"{self.__class__.__name__} requires exactly one output "
                             f"column with tensors containing one-hot-encoded labels")

        # checking if Y is a binary array of one hot encoded labels (only the first rows are inspected)
        df_y_to_check = extract_array(y.iloc[:5])
        if not np.array_equal(df_y_to_check, df_y_to_check.astype(bool)):
            raise Exception(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                            f"of shape (*prediction_shape, numLabels). Did you forget to one-hot-encode your labels "
                            f"before training?")
        # df_y_to_check has shape (N_datapoints=5, *prediction_shape, N_labels)
        prediction_shape = df_y_to_check.shape[1:-1]
        if len(prediction_shape) == 0:
            raise InvalidShapeError(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                                    f"of shape (*prediction_shape, numLabels). However, received array of trivial "
                                    f"prediction_shape. If the predictions are scalars, a TensorToScalarClassificationModel "
                                    f"should be used instead of {self.__class__.__name__}")
        self._numPredictedClasses = df_y_to_check.shape[-1]
        # pass the caller's fit_model flag on (it was previously ignored and always replaced by True)
        super().fit(x, y, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def get_model_output_shape(self):
        """
        :return: the shape of the predicted label tensors, i.e. the shape of a ground truth data point
            without its last (one-hot) dimension; None if no ground truth shape has been recorded yet
        """
        # The ground truth contains one-hot-encoded labels in the last dimension
        # The model output predicts the labels as ints, without one-hot-encoding
        # The ground truth shape recorded by _fit_tensor_model is read directly
        one_hot_encoded_output_shape = self._modelOutputShape
        if one_hot_encoded_output_shape is None:
            return None
        return one_hot_encoded_output_shape[:-1]

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a result returned by predict_class_probabilities to a result as returned by predict.

        :param df: the output data frame from predict_class_probabilities
        :return: an output data frame as it would be returned by predict
        """
        df = df.copy()
        col_name = self.get_predicted_variable_names()[0]
        # the label index is the position of the largest probability along the last axis
        df[col_name] = df[col_name].apply(lambda probas_array: probas_array.argmax(axis=-1))
        return df

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame with a single column containing arrays of shape `(*tensorShape, numLabels)`.
            Raises an exception if the classifier cannot predict probabilities.
        """
        x = self._compute_model_inputs(x)
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        result = self._predict_class_probabilities(x)
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the shapes of the predicted probability tensors match the ground truth (if the output
        shape check is enabled) and whether the entries correspond to probabilities. Violations of the
        latter are only logged as warnings.

        :param prediction_df: the data frame with predicted probability tensors
        :param max_rows_to_check: the number of leading rows whose values are inspected
        :raises InvalidShapeError: if the output shape check is enabled and the shapes do not match
        """
        if self.check_output_shape:
            # probability tensors retain the trailing label dimension, so they have to be compared with
            # the one-hot-encoded ground truth shape rather than with the shape of the predicted labels
            _check_df_shape(prediction_df, self._modelOutputShape)

        array_to_check = extract_array(prediction_df.iloc[:max_rows_to_check])

        if not np.all(0 <= array_to_check) or not np.all(array_to_check <= 1):
            log.warning(f"Probability arrays may not be correctly normalised, "
                        f"got probabilities outside the range [0, 1]")

        s = array_to_check.sum(axis=-1)
        if not np.all(np.isclose(s, 1)):
            log.warning(
                f"Probability array data frame may not be correctly normalised, "
                f"received probabilities do not sum to 1")

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        return self._predict_df_through_array(x, self.get_predicted_variable_names())

    # just renaming the abstract method to implement
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, *predictionShape, N_Labels)`, i.e. with the labels in the last dimension, as in
        the one-hot-encoded ground truth.

        The default implementation of _predict will then use the output of this method and convert it to
        predicted labels (via argmax over the last dimension).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        try:
            predicted_probabilities_df = self._predict_class_probabilities(x)
        except Exception as e:
            # chain explicitly such that the underlying error remains visible as the cause
            raise Exception(f"Wrong implementation of {self.__class__.__name__}. For non-probabilistic classifiers "
                            "_predict has to be overrode!") from e
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

    # TODO or not TODO: I don't see how to reduce the code duplication here...
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Returns an array of integers. If the model was fitted on binary ground truth arrays of
        shape `(*tensorShape, numLabels)`, predictions will have the shape `tensorShape` and contain integers
        0, 1, ..., numLabels - 1. They correspond to the predicted labels
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        y = super().predict(x)
        if self.check_output_shape:
            _check_df_shape(y, self.get_model_output_shape())
        return y