Coverage for src/sensai/tensor

1"""

This module contains base classes for models that input and output tensors, for example CNNs.
The fitting and predictions will still be performed on data frames, like in VectorModel,
but now it will be expected that all entries of the input data frame passed to the model are tensors of the same shape.
Lists of scalars of the same lengths are also accepted. The same is expected of the ground truth data frames.
Everything will work as well if the entries are just scalars, but in this case it is recommended to use
VectorModel instead.

If we denote the shapes of entries in the dfs as inputTensorShape and outputTensorShape,
the model will be fit on input tensors of shape (N_rows, N_inputColumns, inputTensorShape) and output tensors of
shape (N_rows, N_outputColumns, outputTensorShape), where empty dimensions (e.g. for one-column data frames)
will be stripped.

13"""

16import logging

17from abc import ABC, abstractmethod

18from typing import Optional, Tuple

20import numpy as np

21import pandas as pd

23from .util.pandas import extract_array

24from .vector_model import VectorRegressionModel, VectorClassificationModel, VectorModel

# Module-level logger. Its level is deliberately lowered to DEBUG by default, since users are
# often interested in the shape information that is logged while data frames are converted to arrays.
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

class InvalidShapeError(Exception):
    """Raised when the shape of a data point or of a predictions array differs from the expected shape."""

def _get_datapoint_shape(df: pd.DataFrame):
    """
    Determines the shape of a single data point of the given data frame.

    :param df: data frame whose first row is used to determine the shape
    :return: the shape of the array extracted from the first row of the data frame
    """
    # Only the first row is converted. According to the original note, extract_array strips the
    # empty leading dimension (N_Datapoints=1), such that the shape of one data point remains.
    return extract_array(df.iloc[:1]).shape

def _check_df_shape(df: pd.DataFrame, desired_shape: tuple):
    """
    Validates that the data points of the given data frame have the desired shape.

    :param df: the data frame to check (only its first row is inspected)
    :param desired_shape: the shape each data point is expected to have
    :raises InvalidShapeError: if the shape of the first data point differs from the desired shape
    """
    actual_shape = _get_datapoint_shape(df)
    if actual_shape == desired_shape:
        return
    raise InvalidShapeError(f"Wrong input shape for data point. Expected {desired_shape} but got {actual_shape}")

# This is implemented as a mixin because there can be no functional common class for all tensor models.
# The reason is that actual implementations need to inherit from Vector-Regression/Classification-Model
# (or duplicate a lot of code), and thus it is not possible to inherit from something like
# TensorModel(VectorModel) without getting into a mess.

class TensorModel(ABC):
    """
    Mixin providing the array-based fitting and prediction logic of models that operate on tensors.

    Implementations provide `_fit_to_array` and `_predict_array`. The helper methods of this class convert
    data frames to arrays (via `extract_array`) and predicted arrays back to data frames. The shapes of the
    input and ground truth data points seen during fitting are stored and can be retrieved via
    `get_model_input_shape` and `get_model_output_shape`.
    """

    def __init__(self):
        # shapes of a single input/ground truth data point; set by _fit_tensor_model
        self._modelInputShape: Optional[Tuple] = None
        self._modelOutputShape: Optional[Tuple] = None

    @abstractmethod
    def _fit_to_array(self, x: np.ndarray, y: np.ndarray):
        """
        Fits the model to the given arrays.

        :param x: the array extracted from the input data frame
        :param y: the array extracted from the ground truth data frame
        """
        pass

    @abstractmethod
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        """
        The result should be of shape `(N_DataPoints, *predictedTensorShape)` if a single column is predicted
        or of shape `(N_DataPoints, N_Columns, *predictedTensorShape)` if multiple columns are predicted
        (e.g. for multiple regression targets). Note that in both cases, the number of predicted columns
        should coincide with the corresponding number in the ground truth data frame the model was fitted on.

        :param x: a tensor of shape `(N_DataPoints, *inputTensorShape)`
        :return: the array of predictions
        """
        pass

    def _predict_df_through_array(self, x: pd.DataFrame, output_columns: list) -> pd.DataFrame:
        """
        To be used within _predict in implementations of this class. Performs predictions by
        transforming x into an array, computing the predicted array from it and turning the result into a
        predictions data frame.

        :param x: input data frame (of the same type as for `_fit_tensor_model`)
        :param output_columns: columns of the output data frame, typically the result of calling
            `get_predicted_variable_names()` in an implementation
        :return: a data frame with the index of x and one column per entry of output_columns
        :raises InvalidShapeError: if the number of predictions differs from the number of rows of x or if
            the predictions array does not provide one entry per output column
        """
        y = self._predict_array(extract_array(x))
        if len(y) != len(x):
            raise InvalidShapeError(f"Number of data points (lengths) of input data frame and predictions must agree. "
                                    f"Expected {len(x)} but got {len(y)}")

        result = pd.DataFrame(index=x.index)
        n_columns = len(output_columns)
        if n_columns == 1:
            result[output_columns[0]] = list(y)
        else:
            # A predictions array without a second dimension cannot contain multiple columns;
            # it is reported as a shape error rather than failing with an IndexError.
            if len(y.shape) < 2 or y.shape[1] != n_columns:
                raise InvalidShapeError(f"Wrong shape of predictions array for a data frame with {n_columns} columns ({output_columns}). "
                                        f"Expected shape ({len(x)}, {n_columns}, ...) but got: {y.shape}")
            for i, col in enumerate(output_columns):
                result[col] = list(y[:, i])
        return result

    def _fit_tensor_model(self, x: pd.DataFrame, y: pd.DataFrame):
        """
        To be used within _fit in implementations of this class. Converts both data frames to arrays,
        stores the shapes of the first input and ground truth data point and fits the model to the arrays.

        :param x: the input data frame
        :param y: the ground truth data frame
        """
        log.debug(f"Stacking input tensors from columns {x.columns} and from all rows to a single array. "
                  f"Note that all tensors need to have the same shape")
        x = extract_array(x)
        y = extract_array(y)
        self._modelInputShape = x[0].shape
        self._modelOutputShape = y[0].shape
        log.debug(f"Fitting on {len(x)} datapoints of shape {self._modelInputShape}. "
                  f"The ground truth are tensors of shape {self._modelOutputShape}")
        self._fit_to_array(x, y)

    def get_model_input_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single input data point as determined during fitting (None if not yet fitted)
        """
        return self._modelInputShape

    def get_model_output_shape(self) -> Optional[Tuple]:
        """
        :return: the shape of a single ground truth data point as determined during fitting (None if not yet fitted)
        """
        return self._modelOutputShape

121

122

class TensorToScalarRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for regression models which receive tensors as input and produce scalars as output.
        Such models can be evaluated exactly like regression models that do not operate on tensors.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same
            shape as the ones used for fitting. This has to be disabled for certain applications,
            e.g. when applying CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to verify that the input columns at prediction time coincide
            with the ones at fitting time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by tensor models
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Optionally validates the shape of the input data points, then delegates to the superclass prediction.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        if self.check_input_shape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

149

150

class TensorToScalarClassificationModel(VectorClassificationModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_input_columns=True):
        """
        Base class for classification models which receive tensors as input and produce scalars as output.
        Such models can be evaluated exactly like classification models that do not operate on tensors.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same
            shape as the ones used for fitting. This has to be disabled for certain applications,
            e.g. when applying CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to verify that the input columns at prediction time coincide
            with the ones at fitting time
        """
        VectorClassificationModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by tensor models
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        # one output column per class label
        class_labels = self.get_class_labels()
        return self._predict_df_through_array(x, class_labels)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Optionally validates the shape of the input data points, then delegates to the superclass prediction.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        if self.checkInputShape:
            expected_shape = self.get_model_input_shape()
            _check_df_shape(x, expected_shape)
        return super().predict(x)

    # the abstract method to implement is merely renamed for classifiers
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        Implementations of probabilistic classifiers return a tensor of probabilities of shape
        `(N_DataPoints, N_Labels)` here. Labels are assumed to be sorted lexicographically, and the order of
        the predictions in the returned array has to respect this.

        The default implementation of _predict uses the output of this method and converts it to predicted
        labels (via argmax).

        If only labels shall be predicted, or if there is a more efficient way of predicting labels than
        argmax, _predict has to be overridden in the implementation. In the former case (non-probabilistic
        classifier), the implementation of this method should raise an exception like the one below.
        """
        model_name = self.__class__.__name__
        raise NotImplementedError(f"Model {model_name} does not support prediction of probabilities")

196

197

# Note: for tensor-to-tensor models the output shape is not trivial. There will be dedicated evaluators
# and metrics for them. Examples of such models are auto-encoders, models performing semantic segmentation,
# models for super-resolution and so on.

class TensorToTensorRegressionModel(VectorRegressionModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for regression models whose outputs are tensors. Several targets are supported by
        placing them into separate columns, in which case all target tensors must be of the same shape.

        :param check_input_shape: whether to verify at prediction time that the input tensors have the same
            shape as the ones used for fitting. This has to be disabled for certain applications,
            e.g. when applying CNNs to inputs that are larger than the ones in the training set
        :param check_output_shape: whether to verify that predictions have the same shape as the ground truth
            data used for fitting. This has to be disabled for certain applications,
            e.g. when applying CNNs to inputs that are larger than the ones in the training set
        :param check_input_columns: whether to verify that the input columns at prediction time coincide
            with the ones at fitting time
        """
        VectorRegressionModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.checkInputShape = check_input_shape
        self.checkOutputShape = check_output_shape

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by tensor models
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        target_columns = self.get_predicted_variable_names()
        return self._predict_df_through_array(x, target_columns)

    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Computes predictions for a fitted model, optionally validating the shapes of the input data points
        before and of the predicted data points after the prediction.

        :param x: the input data frame
        :return: the data frame of predictions
        """
        if not self.is_fitted():
            message = ("Calling predict with unfitted model. "
                       "This might lead to errors down the line, especially if input/output checks are enabled")
            raise Exception(message)
        if self.checkInputShape:
            _check_df_shape(x, self.get_model_input_shape())
        predictions = super().predict(x)
        if self.checkOutputShape:
            _check_df_shape(predictions, self.get_model_output_shape())
        return predictions

237

238

class TensorToTensorClassificationModel(VectorModel, TensorModel, ABC):
    def __init__(self, check_input_shape=True, check_output_shape=True, check_input_columns=True):
        """
        Base class for classification models that output tensors, e.g. for semantic segmentation. The models
        can be fit on a ground truth data frame with a single column. The entries in this column should be
        binary tensors with one-hot-encoded labels, i.e. of shape `(*predictionShape, numLabels)`

        :param check_input_shape: Whether to check if during predict tensors have the same shape as during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_output_shape: Whether to check if predictions have the same shape as ground truth data during fit.
            For certain applications, e.g. using CNNs on larger inputs than the training set, this has
            to be disabled
        :param check_input_columns: Whether to check if input columns at predict time coincide with those at fit time
        """
        VectorModel.__init__(self, check_input_columns=check_input_columns)
        TensorModel.__init__(self)
        self.check_input_shape = check_input_shape
        self.check_output_shape = check_output_shape
        self._numPredictedClasses: Optional[int] = None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        # sample weights are not supported by tensor models
        self._warn_sample_weights_unsupported(False, weights)
        self._fit_tensor_model(x, y)

    def is_regression_model(self) -> bool:
        return False

    def get_num_predicted_classes(self):
        """
        :return: the size of the label dimension of the ground truth passed to `fit` (None before `fit` was called)
        """
        return self._numPredictedClasses

    def fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None, fit_preprocessors=True, fit_model=True):
        """
        Validates the ground truth and then fits the model via the superclass implementation.

        :param x: data frame containing input tensors on which to train
        :param y: ground truth has to be an array containing only zeroes and ones (one-hot-encoded labels) of the shape
            `(*prediction_shape, numLabels)`
        :param weights: data point weights (must be None; not supported by this model!)
        :param fit_preprocessors: whether the model's preprocessors (feature generators and data frame transformers) shall be fitted
        :param fit_model: whether the model itself shall be fitted
        :raises ValueError: if y does not have exactly one column
        :raises InvalidShapeError: if the ground truth tensors have a trivial prediction shape
        """
        if len(y.columns) != 1:
            raise ValueError(f"{self.__class__.__name__} requires exactly one output "
                             f"column with tensors containing one-hot-encoded labels")

        # checking (on the first rows only) if y is a binary array of one-hot-encoded labels
        y_sample = extract_array(y.iloc[:5])
        if not np.array_equal(y_sample, y_sample.astype(bool)):
            raise Exception(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                            f"of shape (*prediction_shape, numLabels). Did you forget to one-hot-encode your labels "
                            f"before training?")
        # y_sample is expected to have shape (N_datapoints<=5, *prediction_shape, N_labels)
        # NOTE(review): for a ground truth with a single row, the leading dimension is presumably stripped
        # by extract_array, in which case the slice below would drop a prediction dimension - to be confirmed
        prediction_shape = y_sample.shape[1:-1]
        if len(prediction_shape) == 0:
            raise InvalidShapeError(f"Ground truth data points have to be binary arrays of one-hot-encoded labels "
                                    f"of shape (*prediction_shape, numLabels). However, received array of trivial "
                                    f"prediction_shape. If the predictions are scalars, a TensorToScalarClassificationModel "
                                    f"should be used instead of {self.__class__.__name__}")
        self._numPredictedClasses = y_sample.shape[-1]
        # the caller's fit_model flag is forwarded (it must not be forced to True)
        super().fit(x, y, weights=weights, fit_preprocessors=fit_preprocessors, fit_model=fit_model)

    def get_model_output_shape(self):
        """
        :return: the shape of the predicted label tensors, i.e. the shape of the ground truth data points
            without the trailing label dimension (None if the model was not yet fitted)
        """
        # The ground truth contains one-hot-encoded labels in the last dimension.
        # The model output predicts the labels as ints, without one-hot-encoding.
        # The ground truth shape stored during fitting is read directly.
        one_hot_encoded_output_shape = self._modelOutputShape
        if one_hot_encoded_output_shape is None:
            return None
        return one_hot_encoded_output_shape[:-1]

    def convert_class_probabilities_to_predictions(self, df: pd.DataFrame):
        """
        Converts from a result returned by predict_class_probabilities to a result as returned by predict.

        :param df: the output data frame from predict_class_probabilities
        :return: an output data frame as it would be returned by predict
        """
        df = df.copy()
        col_name = self.get_predicted_variable_names()[0]
        # the label dimension is the last one, so the argmax over it yields the predicted label indices
        df[col_name] = df[col_name].apply(lambda probas_array: probas_array.argmax(axis=-1))
        return df

    def predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: the input data
        :return: a data frame with a single column containing arrays of shape `(*tensorShape, numLabels)`.
            Raises an exception if the classifier cannot predict probabilities.
        """
        x = self._compute_model_inputs(x)
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        result = self._predict_class_probabilities(x)
        self._check_prediction(result)
        return result

    def _check_prediction(self, prediction_df: pd.DataFrame, max_rows_to_check=5):
        """
        Checks whether the shapes of the predicted probability arrays match the ground truth (if output shape
        checks are enabled) and logs warnings if the entries of the first rows do not look like probabilities.

        :param prediction_df: the data frame of predicted probability arrays
        :param max_rows_to_check: the number of leading rows for which the probability values are inspected
        :raises InvalidShapeError: if output shape checks are enabled and the shapes do not match
        """
        if self.check_output_shape:
            # Predicted probabilities keep the label dimension. They are therefore compared to the
            # one-hot-encoded ground truth shape, not to the label-free shape of the final predictions.
            _check_df_shape(prediction_df, self._modelOutputShape)

        array_to_check = extract_array(prediction_df.iloc[:max_rows_to_check])

        if not np.all(0 <= array_to_check) or not np.all(array_to_check <= 1):
            log.warning(f"Probability arrays may not be correctly normalised, "
                        f"got probabilities outside the range [0, 1]")

        s = array_to_check.sum(axis=-1)
        if not np.all(np.isclose(s, 1)):
            log.warning(
                f"Probability array data frame may not be correctly normalised, "
                f"received probabilities do not sum to 1")

    def _predict_class_probabilities(self, x: pd.DataFrame) -> pd.DataFrame:
        return self._predict_df_through_array(x, self.get_predicted_variable_names())

    # just renaming the abstract method to implement
    def _predict_array(self, x: np.ndarray) -> np.ndarray:
        return self._predict_probabilities_array(x)

    @abstractmethod
    def _predict_probabilities_array(self, x: np.ndarray) -> np.ndarray:
        """
        If you are implementing a probabilistic classifier, this method should return a tensor with probabilities
        of shape `(N_DataPoints, *tensorShape, N_Labels)`, i.e. with the label dimension last. It is assumed
        that labels are lexicographically sorted and the order of predictions in the output array should respect this.

        The default implementation of _predict will then use the output of this method and convert it to predicted labels (via argmax).

        In case you want to predict labels only or have a more efficient implementation of predicting labels than
        using argmax, you will have to override _predict in your implementation. In the former case of a
        non-probabilistic classifier, the implementation of this method should raise an exception, like the one below.
        """
        raise NotImplementedError(f"Model {self.__class__.__name__} does not support prediction of probabilities")

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        try:
            predicted_probabilities_df = self._predict_class_probabilities(x)
        except NotImplementedError as e:
            # Only the "probabilities not supported" case indicates a wrong implementation.
            # Any other error propagates unchanged instead of being masked by this message.
            raise Exception(f"Wrong implementation of {self.__class__.__name__}. For non-probabilistic classifiers "
                            "_predict has to be overrode!") from e
        return self.convert_class_probabilities_to_predictions(predicted_probabilities_df)

    # TODO or not TODO: I don't see how to reduce the code duplication here...
    def predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        Returns an array of integers. If the model was fitted on binary ground truth arrays of
        shape `(*tensorShape, numLabels)`, predictions will have the shape `tensorShape` and contain integers
        0, 1, ..., numLabels - 1. They correspond to the predicted labels
        """
        if not self.is_fitted():
            raise Exception(f"Calling predict with unfitted model. "
                            f"This might lead to errors down the line, especially if input/output checks are enabled")
        if self.check_input_shape:
            _check_df_shape(x, self.get_model_input_shape())
        y = super().predict(x)
        if self.check_output_shape:
            _check_df_shape(y, self.get_model_output_shape())
        return y

Coverage for src/sensai/tensor_model.py: 32%

179 statements