Coverage for src/sensai/tensor_model.py: 32%
179 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-29 18:29 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-29 18:29 +0000
1"""
2This module contains base classes for models that input and output tensors, for example CNNs.
3The fitting and predictions will still be performed on data frames, like in VectorModel,
4but now it will be expected that all entries of the input data frame passed to the model are tensors of the same shape.
5Lists of scalars of the same lengths are also accepted. The same is expected of the ground truth data frames.
6Everything will work as well if the entries are just scalars but in this case it is recommended to use
7VectorModel instead.
9If we denote the shapes of entries in the dfs as inputTensorShape and outputTensorShape,
10the model will be fit on input tensors of shape (N_rows, N_inputColumns, inputTensorShape) and output tensors of
11shape (N_rows, N_outputColumns, outputTensorShape), where empty dimensions (e.g. for one-column data frames)
12will be stripped.
13"""
16import logging
17from abc import ABC, abstractmethod
18from typing import Optional, Tuple
20import numpy as np
21import pandas as pd
23from .util.pandas import extract_array
24from .vector_model import VectorRegressionModel, VectorClassificationModel, VectorModel
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# The default level is set to DEBUG because it is often interesting for the user to receive
# debug information about shapes as data frames get converted to arrays.
log.setLevel(logging.DEBUG)
class InvalidShapeError(Exception):
    """Raised when a tensor (input, ground truth or prediction) does not have the expected shape."""
def _get_datapoint_shape(df: pd.DataFrame):
    """
    Determines the shape of a single data point by converting the first row of the data frame to an array.

    :param df: the data frame whose first row is inspected
    :return: the `shape` attribute of the array extracted from the first row
    """
    # Slicing with [:1] keeps a one-row data frame; per the original note, the resulting
    # empty first dimension (N_Datapoints=1) is stripped by extract_array
    return extract_array(df.iloc[:1]).shape
def _check_df_shape(df: pd.DataFrame, desired_shape: tuple):
    """
    Verifies that the data points in the given data frame have the desired shape.
    Only the first row is inspected.

    :param df: the data frame to check
    :param desired_shape: the shape that a single data point is expected to have
    :raises InvalidShapeError: if the shape of the first data point differs from `desired_shape`
    """
    datapoint_shape = _get_datapoint_shape(df)
    if datapoint_shape == desired_shape:
        return
    raise InvalidShapeError(f"Wrong input shape for data point. Expected {desired_shape} but got {datapoint_shape}")
48# This is implemented as a mixin because there can be no functional common class for all tensor models.
49# The reason is that actual implementations need to inherit from Vector-Regression/Classification-Model
50# (or duplicate a lot of code) and thus it is not possible to inherit from something like TensorModel(VectorModel)
51# without getting into a mess.
class TensorModel(ABC):
    """
    Mixin for models that are fitted on and predict arrays (tensors) extracted from data frames.
    It records the shapes of the input and output data points seen during fitting and provides
    helpers for converting between data frames and arrays.
    """
    def __init__(self):
        # shapes of a single input/output data point; set by _fit_tensor_model, None before fitting
        self._modelInputShape: Optional[Tuple] = None
        self._modelOutputShape: Optional[Tuple] = None

    @abstractmethod
    def _fit_to_array(self, x: np.ndarray, y: np.ndarray):
        """
        Fits the model on arrays; called by `_fit_tensor_model` with the arrays extracted
        from the input and ground truth data frames.

        :param x: the input array
        :param y: the ground truth array
        """
        pass

    @abstractmethod
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        """
        The result should be of shape `(N_DataPoints, *predictedTensorShape)` if a single column is predicted
        or of shape `(N_DataPoints, N_Columns, *predictedTensorShape)` if multiple columns are predicted
        (e.g. for multiple regression targets). Note that in both cases, the number of predicted columns
        should coincide with corresponding number in the ground truth data frame the model was fitted on

        :param x: a tensor of shape `(N_DataPoints, *inputTensorShape)`
        """
        pass

    def _predict_df_through_array(self, x: pd.DataFrame, output_columns: list) -> pd.DataFrame:
        """
        To be used within _predict in implementations of this class. Performs predictions by
        transforming x into an array, computing the predicted array from it and turning the result into a
        predictions data frame.

        :param x: input data frame (of same type as for `_fit_tensor_model`)
        :param output_columns: columns of the output data frame, typically the result of calling
            `get_predicted_variable_names()` in an implementation
        :return: a data frame with the index of `x` and one column per entry of `output_columns`
        :raises InvalidShapeError: if the number of predictions differs from the number of rows of `x` or
            if the predictions array does not provide one slice per output column
        """
        y = self._predict_array(extract_array(x))
        if len(y) != len(x):
            raise InvalidShapeError(f"Number of data points (lengths) of input data frame and predictions must agree. "
                f"Expected {len(x)} but got {len(y)}")

        result = pd.DataFrame(index=x.index)
        n_columns = len(output_columns)
        if n_columns == 1:
            result[output_columns[0]] = list(y)
        else:
            # A predictions array with fewer than two dimensions cannot hold several columns;
            # checking the rank first reports this as a shape error rather than as an IndexError
            if len(y.shape) < 2 or y.shape[1] != n_columns:
                raise InvalidShapeError(f"Wrong shape of predictions array for a data frame with {n_columns} columns ({output_columns}). "
                    f"Expected shape ({len(x)}, {n_columns}, ...) but got: {y.shape}")
            for i, col in enumerate(output_columns):
                result[col] = list(y[:, i])
        return result

    def _fit_tensor_model(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        To be used within _fit in implementations of this class.
        Converts both data frames to arrays, records the shapes of the first input and output
        data points and delegates the actual fitting to `_fit_to_array`.

        :param x: the input data frame
        :param y: the ground truth data frame
        """
        log.debug(f"Stacking input tensors from columns {x.columns} and from all rows to a single array. "
            f"Note that all tensors need to have the same shape")
        x = extract_array(x)
        y = extract_array(y)
        self._modelInputShape = x[0].shape
        self._modelOutputShape = y[0].shape
        log.debug(f"Fitting on {len(x)} datapoints of shape {self._modelInputShape}. "
            f"The ground truth are tensors of shape {self._modelOutputShape}")
        self._fit_to_array(x, y)

    def get_model_input_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single input data point as recorded during fitting, or None if
            the model was not fitted via `_fit_tensor_model` yet
        """
        return self._modelInputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point as recorded during fitting, or None if
            the model was not fitted via `_fit_tensor_model` yet
        """
        # the original returned the input shape here, which was a copy-paste error
        return self._modelOutputShape
class TensorToScalarRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for regression models mapping tensor inputs to scalar outputs. Such models can be
        evaluated in the same way as non-tensor regression models.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same shape
            as the ones used for fitting. For certain applications (e.g. applying CNNs to inputs that are larger
            than the ones in the training set), the check has to be disabled
        :param check_input_columns: whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # the weights are only handed to the warning helper; they are not forwarded to the tensor fit
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        output_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, output_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the (optional) input shape check and then delegates to the super-class prediction.

        :param x: the input data frame
        :return: the predictions data frame
        """
        if self.check_input_shape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)
class TensorToScalarClassificationModel(VectorClassificationModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for classification models mapping tensor inputs to scalar outputs. Such models can be
        evaluated in the same way as non-tensor classification models.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same shape
            as the ones used for fitting. For certain applications (e.g. applying CNNs to inputs that are larger
            than the ones in the training set), the check has to be disabled
        :param check_input_columns: whether to check if input columns at predict time coincide with those at fit time
        """
        VectorClassificationModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # the weights are only handed to the warning helper; they are not forwarded to the tensor fit
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        class_labels = self.get_class_labels()
        return self._predict_df_through_array(x, class_labels)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the (optional) input shape check and then delegates to the super-class prediction.

        :param x: the input data frame
        :return: the predictions data frame
        """
        if self.checkInputShape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

    # The array prediction required by TensorModel is the probability prediction;
    # this merely gives the abstract method to implement a more specific name
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        Implementations of probabilistic classifiers should return a tensor of probabilities
        of shape `(N_DataPoints, N_Labels)` here. Labels are assumed to be lexicographically sorted, and the
        order of the predictions in the output array should respect this.

        The default implementation of _predict uses the output of this method and converts it to
        predicted labels (via argmax).

        If only labels shall be predicted or if there is a more efficient way of predicting labels than
        applying argmax, _predict has to be overridden in the implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception,
        like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")
198# Note: for tensor to tensor models the output shape is not trivial. There will be dedicated evaluators
199# and metrics for them. Examples for such models are auto-encoders, models performing semantic segmentation,
200# models for super-resolution and so on
class TensorToTensorRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for regression models whose outputs are tensors. Several targets are supported by
        placing them into separate columns, in which case all target tensors must have the same shape.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same shape
            as the ones used for fitting. For certain applications (e.g. applying CNNs to inputs that are larger
            than the ones in the training set), the check has to be disabled
        :param check_output_shape: whether to verify that predictions have the same shape as the ground truth
            data used for fitting. For certain applications (e.g. applying CNNs to inputs that are larger
            than the ones in the training set), the check has to be disabled
        :param check_input_columns: whether to check if input columns at predict time coincide with those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape
        self.checkOutputShape = check_output_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # the weights are only handed to the warning helper; they are not forwarded to the tensor fit
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        output_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, output_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Computes predictions, applying the (optional) shape checks to the inputs before and to the
        predictions after the super-class prediction.

        :param x: the input data frame
        :return: the predictions data frame
        :raises Exception: if the model has not been fitted
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.checkInputShape:
            _check_df_shape(x, self.get_model_input_shape())
        predictions = super().predict(x)
        if self.checkOutputShape:
            _check_df_shape(predictions, self.get_model_output_shape())
        return predictions
class TensorToTensorClassificationModel(VectorModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for classification models that output tensors, e.g. for semantic segmentation. The models
        can be fit on a ground truth data frame with a single column. The entries in this column should be
        binary tensors with one-hot-encoded labels, i.e. of shape `(*predictionShape, numLabels)`

        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape
        self.check_output_shape = check_output_shape
        # number of labels (size of the last dimension of the ground truth tensors); set in fit
        self._numPredictedClasses: Optional[int] = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # the weights are only handed to the warning helper; they are not forwarded to the tensor fit
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def is_regression_model(self) -> bool:
        return False

    def get_num_predicted_classes(self) -> Optional[int]:
        """
        :return: the number of labels as determined from the ground truth in `fit`, or None if `fit`
            has not been called yet
        """
        return self._numPredictedClasses

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None, fit_preprocessors=True, fit_model=True):
        """
        Validates the ground truth (using its first five rows) and then fits the model via the super-class.

        :param x: data frame containing input tensors on which to train
        :param y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape
            `(*prediction_shape, numLabels)`
        :param weights: data point weights (must be None; not supported by this model!)
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises ValueError: if `y` does not have exactly one column
        :raises InvalidShapeError: if the ground truth tensors have a trivial prediction shape
        """
        if len(y.columns) != 1:
            raise ValueError(f"{self.__class__.__name__} requires exactly one output "
                f"column with tensors containing one-hot-encoded labels")

        # checking if Y is a binary array of one hot encoded labels (only the first rows are inspected)
        df_y_to_check = extract_array(y.iloc[:5])
        if not np.array_equal(df_y_to_check, df_y_to_check.astype(bool)):
            raise Exception("Ground truth data points have to be binary arrays of one-hot-encoded labels "
                "of shape (*prediction_shape, numLabels). Did you forget to one-hot-encode your labels "
                "before training?")
        # df_y_to_check has shape (N_datapoints<=5, *prediction_shape, N_labels)
        prediction_shape = df_y_to_check.shape[1:-1]
        if len(prediction_shape) == 0:
            raise InvalidShapeError(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                f"of shape (*prediction_shape, numLabels). However, received array of trivial "
                f"prediction_shape. If the predictions are scalars, a TensorToScalarClassificationModel "
                f"should be used instead of {self.__class__.__name__}")
        self._numPredictedClasses = df_y_to_check.shape[-1]
        # pass on the caller's fit_model flag (the original hard-coded True here, ignoring the argument)
        super().fit(x, y, weights=weights, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def _get_one_hot_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point, i.e. `(*predictionShape, numLabels)`, as
            recorded by `_fit_tensor_model`; None if the model has not been fitted yet
        """
        return self._modelOutputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single predicted label tensor, i.e. the ground truth shape without the
            last (one-hot) dimension; None if the model has not been fitted yet
        """
        # The ground truth contains one-hot-encoded labels in the last dimension
        # The model output predicts the labels as ints, without one-hot-encoding
        one_hot_encoded_output_shape = self._get_one_hot_output_shape()
        if one_hot_encoded_output_shape is None:
            return None
        return one_hot_encoded_output_shape[:-1]

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a result returned by predict_class_probabilities to a result as returned by predict.

        :param df: the output data frame from predict_class_probabilities
        :return: an output data frame as it would be returned by predict
        """
        df = df.copy()
        col_name = self.get_predicted_variable_names()[0]
        # the labels are the indices of the largest probability along the last (label) dimension
        df[col_name] = df[col_name].apply(lambda probas_array: probas_array.argmax(axis=-1))
        return df

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame with a single column containing arrays of shape `(*tensorShape, numLabels)`.
            Raises an exception if the classifier cannot predict probabilities.
        """
        x = self._compute_model_inputs(x)
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        result = self._predict_class_probabilities(x)
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the shapes of the predicted probability tensors match the ground truth (if output shape
        checking is enabled) and whether the entries correspond to probabilities. Violations of the latter
        are only logged as warnings.

        :param prediction_df: data frame with predicted probability tensors
        :param max_rows_to_check: the number of leading rows to which the probability checks are applied
        :raises InvalidShapeError: if output shape checking is enabled and the shapes do not match
        """
        if self.check_output_shape:
            # Probability tensors carry the label dimension, so they must be compared with the
            # one-hot-encoded ground truth shape, not with the shape of the predicted labels
            _check_df_shape(prediction_df, self._get_one_hot_output_shape())

        array_to_check = extract_array(prediction_df.iloc[:max_rows_to_check])

        if not np.all(0 <= array_to_check) or not np.all(array_to_check <= 1):
            log.warning("Probability arrays may not be correctly normalised, "
                "got probabilities outside the range [0, 1]")

        s = array_to_check.sum(axis=-1)
        if not np.all(np.isclose(s, 1)):
            log.warning(
                "Probability array data frame may not be correctly normalised, "
                "received probabilities do not sum to 1")

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        return self._predict_df_through_array(x, self.get_predicted_variable_names())

    # just renaming the abstract method to implement
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, *tensorShape, N_Labels)`, i.e. one array of shape `(*tensorShape, N_Labels)` per
        data point. It is assumed that labels are lexicographically sorted and the order
        of predictions along the last dimension should respect this.

        The default implementation of _predict will then use the output of this method and convert it to predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts labels by computing class probabilities and applying argmax.

        :param x: the input data frame
        :return: the data frame of predicted label tensors
        :raises Exception: if the model does not implement the prediction of probabilities
        """
        try:
            predicted_probabilities_df = self._predict_class_probabilities(x)
        except NotImplementedError as e:
            # Only the "probabilities unsupported" case indicates a wrong implementation; any other
            # error propagates unchanged so that its actual cause is not masked
            raise Exception(f"Wrong implementation of {self.__class__.__name__}. For non-probabilistic classifiers "
                "_predict has to be overrode!") from e
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

    # TODO or not TODO: I don't see how to reduce the code duplication here...
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Returns an array of integers. If the model was fitted on binary ground truth arrays of
        shape `(*tensorShape, numLabels)`, predictions will have the shape `tensorShape` and contain integers
        0, 1, ..., numLabels - 1. They correspond to the predicted labels

        :param x: the input data frame
        :return: the data frame of predicted label tensors
        :raises Exception: if the model has not been fitted
        """
        if not self.is_fitted():
            raise Exception("Calling predict with unfitted model. "
                "This might lead to errors down the line, especially if input/output checks are enabled")
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        y = super().predict(x)
        if self.check_output_shape:
            _check_df_shape(y, self.get_model_output_shape())
        return y