Coverage for src/sensai/tensor_model.py: 33%
175 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1"""
2This module contains base classes for models that input and output tensors, for example CNNs.
3The fitting and predictions will still be performed on data frames, like in VectorModel,
4but now it will be expected that all entries of the input data frame passed to the model are tensors of the same shape.
5Lists of scalars of the same lengths are also accepted. The same is expected of the ground truth data frames.
6Everything will work as well if the entries are just scalars but in this case it is recommended to use
7VectorModel instead.
9If we denote the shapes of entries in the dfs as inputTensorShape and outputTensorShape,
10the model will be fit on input tensors of shape (N_rows, N_inputColumns, inputTensorShape) and output tensors of
11shape (N_rows, N_outputColumns, outputTensorShape), where empty dimensions (e.g. for one-column data frames)
12will be stripped.
13"""
16import logging
17from abc import ABC, abstractmethod
18from typing import Optional, Tuple
20import numpy as np
21import pandas as pd
23from .util.pandas import extract_array
24from .vector_model import VectorRegressionModel, VectorClassificationModel, VectorModel
# Shape information emitted while data frames are converted to arrays is often interesting
# for the user, which is why this module's logger lets debug messages through by default.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
class InvalidShapeError(Exception):
    """Signals that a data point or a predictions array does not have the expected shape."""
def _get_datapoint_shape(df: pd.DataFrame):
    """
    Determines the shape of a single data point by converting only the first row of the data frame.

    :param df: data frame whose first row is inspected
    :return: the `shape` of the array extracted from the first row
    """
    # extract_array is expected to strip the empty leading dimension that results from N_Datapoints=1,
    # such that the array's shape is the shape of the data point itself
    single_row_array = extract_array(df.iloc[:1])
    return single_row_array.shape
def _check_df_shape(df: pd.DataFrame, desired_shape: tuple):
    """
    Validates that the data points in the given data frame have the desired shape
    (only the first row is inspected).

    :param df: the data frame to check
    :param desired_shape: the shape a single data point is required to have
    :raises InvalidShapeError: if the shape of the first data point differs from `desired_shape`
    """
    actual_shape = _get_datapoint_shape(df)
    if actual_shape == desired_shape:
        return
    raise InvalidShapeError(f"Wrong input shape for data point. Expected {desired_shape} but got {actual_shape}")
48# This is implemented as a mixin because there can be no functional common class for all tensor models.
49# The reason is that actual implementations need to inherit from Vector-Regression/Classification-Model
50# (or duplicate a lot of code) and thus it is not possible to inherit from something like TensorModel(VectorModel)
51# without getting into a mess.
class TensorModel(ABC):
    """
    Mixin for models that are fitted on and predict arrays (tensors) which are extracted from data frames.

    It converts data frames to arrays via `extract_array`, remembers the shapes of a single input and
    a single ground truth data point as seen during fitting, and delegates the actual work to
    `_fit_to_array` and `_predict_array`, which have to be provided by implementations.
    """
    def __init__(self):
        # shapes of a single data point; both remain None until _fit_tensor_model has been called
        self._modelInputShape: Optional[Tuple] = None
        self._modelOutputShape: Optional[Tuple] = None

    @abstractmethod
    def _fit_to_array(self, x: np.ndarray, y: np.ndarray):
        """
        Fits the underlying model on arrays; called by `_fit_tensor_model`.

        :param x: the array extracted from the input data frame
        :param y: the array extracted from the ground truth data frame
        """
        pass

    @abstractmethod
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        """
        The result should be of shape `(N_DataPoints, *predictedTensorShape)` if a single column is predicted
        or of shape `(N_DataPoints, N_Columns, *predictedTensorShape)` if multiple columns are predicted
        (e.g. for multiple regression targets). Note that in both cases, the number of predicted columns
        should coincide with the corresponding number in the ground truth data frame the model was fitted on.

        :param x: a tensor of shape `(N_DataPoints, *inputTensorShape)`
        """
        pass

    def _predict_df_through_array(self, x: pd.DataFrame, output_columns: list) -> pd.DataFrame:
        """
        To be used within _predict in implementations of this class. Performs predictions by
        transforming x into an array, computing the predicted array from it and turning the result into a
        predictions data frame.

        :param x: input data frame (of the same type as for `_fit_tensor_model`)
        :param output_columns: columns of the output data frame, typically the result of calling
            `get_predicted_variable_names()` in an implementation
        :return: a data frame with the index of `x` and one column per entry of `output_columns`,
            where each cell holds the prediction for the respective data point
        :raises InvalidShapeError: if the number of predictions differs from the number of rows of `x` or if,
            for multiple output columns, the second dimension of the predicted array does not match the
            number of output columns
        """
        y = self._predict_array(extract_array(x))
        if len(y) != len(x):
            raise InvalidShapeError(f"Number of data points (lengths) of input data frame and predictions must agree. "
                                    f"Expected {len(x)} but got {len(y)}")

        result = pd.DataFrame(index=x.index)
        n_columns = len(output_columns)
        if n_columns == 1:
            result[output_columns[0]] = list(y)
        else:
            # a 1-dimensional array has no column dimension at all; report this as a shape problem
            # rather than failing with an IndexError when accessing y.shape[1]
            if len(y.shape) < 2 or n_columns != y.shape[1]:
                raise InvalidShapeError(f"Wrong shape of predictions array for a data frame with {n_columns} columns ({output_columns}). "
                                        f"Expected shape ({len(x)}, {n_columns}, ...) but got: {y.shape}")
            for i, col in enumerate(output_columns):
                result[col] = list(y[:, i])
        return result

    def _fit_tensor_model(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        To be used within _fit in implementations of this class.
        Converts both data frames to arrays, stores the shapes of the first input and
        the first ground truth data point and then calls `_fit_to_array`.

        :param x: data frame containing the input tensors
        :param y: data frame containing the ground truth tensors
        """
        log.debug(f"Stacking input tensors from columns {x.columns} and from all rows to a single array. "
                  f"Note that all tensors need to have the same shape")
        x = extract_array(x)
        y = extract_array(y)
        self._modelInputShape = x[0].shape
        self._modelOutputShape = y[0].shape
        log.debug(f"Fitting on {len(x)} datapoints of shape {self._modelInputShape}. "
                  f"The ground truth are tensors of shape {self._modelOutputShape}")
        self._fit_to_array(x, y)

    def get_model_input_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single input data point as seen during fitting, or None if
            `_fit_tensor_model` was not called yet
        """
        return self._modelInputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point as seen during fitting, or None if
            `_fit_tensor_model` was not called yet
        """
        return self._modelOutputShape
class TensorToScalarRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for regression models which receive tensors as input and produce scalars as output.
        Such models can be evaluated just like regression models that do not operate on tensors.

        :param check_input_shape: whether predict shall verify that the input tensors have the shape that
            was seen during fit. This has to be disabled for certain applications, e.g. when applying CNNs
            to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to check if the input columns at predict time coincide with
            those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """Fits the model via the array-based fitting mechanism of TensorModel."""
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """Predicts via the array-based mechanism, using the predicted variable names as output columns."""
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Optionally validates the shape of the input data points and then delegates to the super-class.

        :param x: the input data frame
        :return: the predictions data frame
        """
        if self.check_input_shape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)
class TensorToScalarClassificationModel(VectorClassificationModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for classification models which receive tensors as input and produce scalars as output.
        Such models can be evaluated just like classification models that do not operate on tensors.

        :param check_input_shape: whether predict shall verify that the input tensors have the shape that
            was seen during fit. This has to be disabled for certain applications, e.g. when applying CNNs
            to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to check if the input columns at predict time coincide with
            those at fit time
        """
        VectorClassificationModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """Predicts via the array-based mechanism, using the class labels as output columns."""
        label_columns = self.get_class_labels()
        return self._predict_df_through_array(x, label_columns)

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame):
        """Fits the classifier via the array-based fitting mechanism of TensorModel."""
        self._fit_tensor_model(x, y)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Optionally validates the shape of the input data points and then delegates to the super-class.

        :param x: the input data frame
        :return: the predictions data frame
        """
        if self.checkInputShape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

    # the abstract method of TensorModel is merely given a more specific name for implementations
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        probabilities = self._predict_probabilities_array(x)
        return probabilities

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        Implementations of probabilistic classifiers shall return a tensor of probabilities of shape
        `(N_DataPoints, N_Labels)` here. Labels are assumed to be sorted lexicographically, and the order of
        the predictions in the returned array has to respect this order.

        The default implementation of _predict uses the output of this method and converts it to
        predicted labels (via argmax).

        If only labels shall be predicted, or if there is a more efficient way of predicting labels than
        applying argmax, _predict has to be overridden in the implementation. In the former case of a
        non-probabilistic classifier, this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")
196# Note: for tensor to tensor models the output shape is not trivial. There will be dedicated evaluators
197# and metrics for them. Examples for such models are auto-encoders, models performing semantic segmentation,
198# models for super-resolution and so on
class TensorToTensorRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for regression models whose outputs are tensors. Several targets are supported by
        placing them in separate columns, in which case all target tensors must have the same shape.

        :param check_input_shape: whether predict shall verify that the input tensors have the shape that
            was seen during fit. This has to be disabled for certain applications, e.g. when applying CNNs
            to inputs that are larger than the ones in the training set
        :param check_output_shape: whether predict shall verify that the predictions have the shape of the
            ground truth data seen during fit. This has to be disabled for certain applications, e.g. when
            applying CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to check if the input columns at predict time coincide with
            those at fit time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape
        self.checkOutputShape = check_output_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """Fits the model via the array-based fitting mechanism of TensorModel."""
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """Predicts via the array-based mechanism, using the predicted variable names as output columns."""
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Applies the super-class prediction, framed by the (optional) checks of input and output shapes.

        :param x: the input data frame
        :return: the predictions data frame
        :raises Exception: if the model has not been fitted
        """
        fitted = self.is_fitted()
        if not fitted:
            raise Exception("Calling predict with unfitted model. "
                            "This might lead to errors down the line, especially if input/output checks are enabled")
        if self.checkInputShape:
            _check_df_shape(x, self.get_model_input_shape())
        predictions = super().predict(x)
        if self.checkOutputShape:
            _check_df_shape(predictions, self.get_model_output_shape())
        return predictions
class TensorToTensorClassificationModel(VectorModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for classification models that output tensors, e.g. for semantic segmentation. The models
        can be fit on a ground truth data frame with a single column. The entries in this column should be
        binary tensors with one-hot-encoded labels, i.e. of shape `(*predictionShape, numLabels)`

        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape
        self.check_output_shape = check_output_shape
        # number of classes, i.e. the size of the last (one-hot) dimension of the ground truth; set in fit
        self._numPredictedClasses: Optional[int] = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame):
        """Fits the model via the array-based fitting mechanism of TensorModel."""
        self._fit_tensor_model(x, y)

    def is_regression_model(self) -> bool:
        return False

    def get_num_predicted_classes(self):
        """
        :return: the number of classes as determined from the ground truth in `fit`, or None if `fit`
            was not called yet
        """
        return self._numPredictedClasses

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, fit_preprocessors=True, fit_model=True):
        """
        Validates the ground truth (based on up to the first five rows) and then fits the model
        via the super-class.

        :param x: data frame containing input tensors on which to train
        :param y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape
            `(*prediction_shape, numLabels)`
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises ValueError: if `y` does not have exactly one column
        :raises InvalidShapeError: if the ground truth tensors have a trivial prediction shape
        :raises Exception: if the checked ground truth entries are not binary
        """
        if len(y.columns) != 1:
            raise ValueError(f"{self.__class__.__name__} requires exactly one output "
                             f"column with tensors containing one-hot-encoded labels")

        # checking if Y is a binary array of one hot encoded labels
        df_y_to_check = extract_array(y.iloc[:5])
        if not np.array_equal(df_y_to_check, df_y_to_check.astype(bool)):
            raise Exception(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                            f"of shape (*prediction_shape, numLabels). Did you forget to one-hot-encode your labels "
                            f"before training?")
        # df_y_to_check has shape (N_datapoints<=5, *prediction_shape, N_labels)
        # NOTE(review): if extract_array strips the leading dimension for a single-row data frame, the
        # slicing below would drop the first dimension of prediction_shape - to be confirmed
        prediction_shape = df_y_to_check.shape[1:-1]
        if len(prediction_shape) == 0:
            raise InvalidShapeError(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                                    f"of shape (*prediction_shape, numLabels). However, received array of trivial "
                                    f"prediction_shape. If the predictions are scalars, a TensorToScalarClassificationModel "
                                    f"should be used instead of {self.__class__.__name__}")
        self._numPredictedClasses = df_y_to_check.shape[-1]
        # pass on the caller's choice instead of unconditionally fitting the model
        super().fit(x, y, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def _get_one_hot_encoded_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single (one-hot-encoded) ground truth data point, `(*tensorShape, numLabels)`,
            as stored by `_fit_tensor_model`, or None if the model was not fitted yet
        """
        # The attribute is read directly in order to be sure to obtain the shape of the ground truth
        return self._modelOutputShape

    def get_model_output_shape(self):
        """
        :return: the shape of a single predicted data point as returned by `predict`, i.e. the ground truth
            shape without the last (one-hot) dimension, or None if the model was not fitted yet
        """
        # The ground truth contains one-hot-encoded labels in the last dimension
        # The model output predicts the labels as ints, without one-hot-encoding
        one_hot_encoded_output_shape = self._get_one_hot_encoded_output_shape()
        if one_hot_encoded_output_shape is None:
            return None
        return one_hot_encoded_output_shape[:-1]

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a result returned by predict_class_probabilities to a result as returned by predict.

        :param df: the output data frame from predict_class_probabilities
        :return: an output data frame as it would be returned by predict
        """
        df = df.copy()
        col_name = self.get_predicted_variable_names()[0]
        # the labels are encoded in the last dimension, such that argmax along it yields the label index
        df[col_name] = df[col_name].apply(lambda probas_array: probas_array.argmax(axis=-1))
        return df

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame with a single column containing arrays of shape `(*tensorShape, numLabels)`.
            Raises an exception if the classifier cannot predict probabilities.
        """
        x = self._compute_model_inputs(x)
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        result = self._predict_class_probabilities(x)
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the shape of the predicted probability arrays matches the (one-hot-encoded) ground truth
        (if output shape checks are enabled) and whether the entries correspond to probabilities.
        Violations of the latter are only reported via warnings.

        :param prediction_df: data frame of predicted class probabilities
        :param max_rows_to_check: the number of leading rows whose values are checked
        :raises InvalidShapeError: if the shape check is enabled and fails
        """
        if self.check_output_shape:
            # probabilities still contain the label dimension, so they have to be compared to the
            # one-hot-encoded shape rather than to the shape of the predicted labels
            _check_df_shape(prediction_df, self._get_one_hot_encoded_output_shape())

        array_to_check = extract_array(prediction_df.iloc[:max_rows_to_check])

        if not np.all(0 <= array_to_check) or not np.all(array_to_check <= 1):
            log.warning(f"Probability arrays may not be correctly normalised, "
                        f"got probabilities outside the range [0, 1]")

        s = array_to_check.sum(axis=-1)
        if not np.all(np.isclose(s, 1)):
            log.warning(
                f"Probability array data frame may not be correctly normalised, "
                f"received probabilities do not sum to 1")

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """Predicts probabilities via the array-based mechanism, using the predicted variable names as columns."""
        return self._predict_df_through_array(x, self.get_predicted_variable_names())

    # just renaming the abstract method to implement
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, *tensorShape, N_Labels)`, i.e. with the labels in the last dimension.
        It is assumed that labels are lexicographically sorted and the order
        of predictions in the output array should respect this.

        The default implementation of _predict will then use the output of this method and convert it to predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Predicts labels by computing class probabilities and applying argmax along the label dimension.

        :param x: the model input data frame
        :return: data frame with the predicted labels
        :raises Exception: if the prediction of probabilities fails; the original error is attached as the cause
        """
        try:
            predicted_probabilities_df = self._predict_class_probabilities(x)
        except Exception as e:
            # chain explicitly such that the actual reason for the failure remains visible
            raise Exception(f"Wrong implementation of {self.__class__.__name__}. For non-probabilistic classifiers "
                            "_predict has to be overrode!") from e
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

    # TODO or not TODO: I don't see how to reduce the code duplication here...
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Returns an array of integers. If the model was fitted on binary ground truth arrays of
        shape `(*tensorShape, numLabels)`, predictions will have the shape `tensorShape` and contain integers
        0, 1, ..., numLabels - 1. They correspond to the predicted labels

        :param x: the input data frame
        :return: the predictions data frame
        :raises Exception: if the model has not been fitted
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        y = super().predict(x)
        if self.check_output_shape:
            _check_df_shape(y, self.get_model_output_shape())
        return y