Coverage for src/sensai/evaluation/evaluator.py: 65%
243 statements
coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

import functools
import logging
from abc import ABC, abstractmethod
from typing import Tuple, Dict, Any, Generator, Generic, TypeVar, Sequence, Optional, List, Union, Callable

import pandas as pd

from .eval_stats import GUESS
from .eval_stats.eval_stats_base import EvalStats, EvalStatsCollection
from .eval_stats.eval_stats_classification import ClassificationEvalStats, ClassificationMetric
from .eval_stats.eval_stats_regression import RegressionEvalStats, RegressionEvalStatsCollection, RegressionMetric
from ..data import DataSplitter, DataSplitterFractional, InputOutputData
from ..data_transformation import DataFrameTransformer
from ..tracking import TrackingMixin, TrackedExperiment
from ..util.deprecation import deprecated
from ..util.string import ToStringMixin
from ..util.typing import PandasNamedTuple
from ..vector_model import VectorClassificationModel, VectorModel, VectorModelBase, VectorModelFittableBase, VectorRegressionModel

log = logging.getLogger(__name__)

TModel = TypeVar("TModel", bound=VectorModel)
TEvalStats = TypeVar("TEvalStats", bound=EvalStats)
TEvalStatsCollection = TypeVar("TEvalStatsCollection", bound=EvalStatsCollection)


class MetricsDictProvider(TrackingMixin, ABC):
    @abstractmethod
    def _compute_metrics(self, model, **kwargs) -> Dict[str, float]:
        """
        Computes metrics for the given model, typically by fitting the model and applying it to test data.

        :param model: the model
        :param kwargs: parameters to pass on to the underlying evaluation method
        :return: a dictionary with metric values
        """
        pass

    def compute_metrics(self, model, **kwargs) -> Optional[Dict[str, float]]:
        """
        Computes metrics for the given model, typically by fitting the model and applying it to test data.
        If a tracked experiment was previously set, the metrics are tracked, with the string representation
        of the model added under the additional key 'str(model)'.

        :param model: the model for which to compute metrics
        :param kwargs: parameters to pass on to the underlying evaluation method
        :return: a dictionary with metric values
        """
        values_dict = self._compute_metrics(model, **kwargs)
        if self.tracked_experiment is not None:
            self.tracked_experiment.track_values(values_dict, add_values_dict={"str(model)": str(model)})  # TODO strings unsupported (mlflow)
        return values_dict


class MetricsDictProviderFromFunction(MetricsDictProvider):
    def __init__(self, compute_metrics_fn: Callable[[VectorModel], Dict[str, float]]):
        self._compute_metrics_fn = compute_metrics_fn

    def _compute_metrics(self, model, **kwargs) -> Dict[str, float]:
        return self._compute_metrics_fn(model)
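

# Illustrative sketch (not part of the original module): wrapping a plain callable so that it can be
# used wherever a MetricsDictProvider is expected, e.g. in hyperparameter optimisation.
# The evaluator passed in is a hypothetical placeholder.
def _example_metrics_dict_provider_from_function(evaluator: "VectorModelEvaluator") -> MetricsDictProvider:
    # the wrapped callable receives a model and must return a metrics dictionary
    return MetricsDictProviderFromFunction(lambda model: evaluator.compute_metrics(model))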


class VectorModelEvaluationData(ABC, Generic[TEvalStats]):
    def __init__(self, stats_dict: Dict[str, TEvalStats], io_data: InputOutputData, model: VectorModelBase):
        """
        :param stats_dict: a dictionary mapping from output variable name to the evaluation statistics object
        :param io_data: the input/output data that was used to produce the results
        :param model: the model that was used to produce predictions
        """
        self.io_data = io_data
        self.eval_stats_by_var_name = stats_dict
        self.predicted_var_names = list(self.eval_stats_by_var_name.keys())
        self.model = model

    @property
    def model_name(self):
        return self.model.get_name()

    @property
    def input_data(self):  # for backward compatibility
        return self.io_data.inputs

    def get_eval_stats(self, predicted_var_name=None) -> TEvalStats:
        if predicted_var_name is None:
            if len(self.eval_stats_by_var_name) != 1:
                raise Exception(f"Must provide the name of the predicted variable, as multiple variables were predicted:"
                    f" {list(self.eval_stats_by_var_name.keys())}")
            else:
                predicted_var_name = next(iter(self.eval_stats_by_var_name.keys()))
        eval_stats = self.eval_stats_by_var_name.get(predicted_var_name)
        if eval_stats is None:
            raise ValueError(f"No evaluation data present for '{predicted_var_name}'; known output variables: "
                f"{list(self.eval_stats_by_var_name.keys())}")
        return eval_stats

    def get_data_frame(self):
        """
        Returns a DataFrame with all evaluation metrics (one row per output variable)

        :return: a DataFrame containing evaluation metrics
        """
        stats_dicts = []
        var_names = []
        for predictedVarName, evalStats in self.eval_stats_by_var_name.items():
            stats_dicts.append(evalStats.metrics_dict())
            var_names.append(predictedVarName)
        df = pd.DataFrame(stats_dicts, index=var_names)
        df.index.name = "predictedVar"
        return df

    def iter_input_output_ground_truth_tuples(self, predicted_var_name=None) -> Generator[Tuple[PandasNamedTuple, Any, Any], None, None]:
        eval_stats = self.get_eval_stats(predicted_var_name)
        for i, named_tuple in enumerate(self.input_data.itertuples()):
            yield named_tuple, eval_stats.y_predicted[i], eval_stats.y_true[i]


class VectorRegressionModelEvaluationData(VectorModelEvaluationData[RegressionEvalStats]):
    def get_eval_stats_collection(self):
        return RegressionEvalStatsCollection(list(self.eval_stats_by_var_name.values()))


TEvalData = TypeVar("TEvalData", bound=VectorModelEvaluationData)


class EvaluatorParams(ToStringMixin, ABC):
    def __init__(self, data_splitter: DataSplitter = None, fractional_split_test_fraction: float = None, fractional_split_random_seed=42,
            fractional_split_shuffle=True):
        """
        :param data_splitter: [if test data must be obtained via split] a splitter to use in order to obtain the split; if None,
            must specify fractional_split_test_fraction for a fractional split (default)
        :param fractional_split_test_fraction: [if test data must be obtained via split and data_splitter is None] the fraction of the
            data to use for testing/evaluation
        :param fractional_split_random_seed: [if test data must be obtained via split and data_splitter is None] the random seed to use
            for the fractional split of the data
        :param fractional_split_shuffle: [if test data must be obtained via split and data_splitter is None] whether to randomly (based on
            fractional_split_random_seed) shuffle the dataset before splitting it
        """
        self._dataSplitter = data_splitter
        self._fractionalSplitTestFraction = fractional_split_test_fraction
        self._fractionalSplitRandomSeed = fractional_split_random_seed
        self._fractionalSplitShuffle = fractional_split_shuffle

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        d = {}
        if self._dataSplitter is not None:
            d["dataSplitter"] = self._dataSplitter
        else:
            d["fractionalSplitTestFraction"] = self._fractionalSplitTestFraction
            d["fractionalSplitRandomSeed"] = self._fractionalSplitRandomSeed
            d["fractionalSplitShuffle"] = self._fractionalSplitShuffle
        return d

    def get_data_splitter(self) -> DataSplitter:
        if self._dataSplitter is None:
            if self._fractionalSplitTestFraction is None:
                raise ValueError("Cannot create default data splitter, as no split fraction was provided")
            self._dataSplitter = DataSplitterFractional(1 - self._fractionalSplitTestFraction, shuffle=self._fractionalSplitShuffle,
                random_seed=self._fractionalSplitRandomSeed)
        return self._dataSplitter

    def set_data_splitter(self, splitter: DataSplitter):
        self._dataSplitter = splitter


class VectorModelEvaluator(MetricsDictProvider, Generic[TEvalData], ABC):
    def __init__(self, data: InputOutputData, test_data: InputOutputData = None, params: EvaluatorParams = None):
        """
        Constructs an evaluator with test and training data.

        :param data: the full data set, or, if `test_data` is given, the training data
        :param test_data: the data to use for testing/evaluation; if None, must specify appropriate parameters to define splitting
        :param params: the parameters
        """
        if test_data is None:
            if params is None:
                raise ValueError("Parameters required for data split must be provided")
            data_splitter = params.get_data_splitter()
            self.training_data, self.test_data = data_splitter.split(data)
            log.debug(f"{data_splitter} created split with {len(self.training_data)} "
                f"({100 * len(self.training_data) / len(data):.2f}%) training and "
                f"{len(self.test_data)} ({100 * len(self.test_data) / len(data):.2f}%) test data points")
        else:
            self.training_data = data
            self.test_data = test_data

    def set_tracked_experiment(self, tracked_experiment: TrackedExperiment):
        """
        Sets a tracked experiment, which will result in metrics being saved whenever compute_metrics is called
        or eval_model is called with track=True.

        :param tracked_experiment: the experiment in which to track evaluation metrics
        """
        super().set_tracked_experiment(tracked_experiment)

    def eval_model(self, model: Union[VectorModelBase, VectorModelFittableBase], on_training_data=False, track=True,
            fit=False) -> TEvalData:
        """
        Evaluates the given model

        :param model: the model to evaluate
        :param on_training_data: if True, evaluate on this evaluator's training data rather than the held-out test data
        :param track: whether to track the evaluation metrics in case a tracked experiment was set on this object
        :param fit: whether to fit the model before evaluating it (via this object's `fit_model` method); if enabled, the model
            must support fitting
        :return: the evaluation result
        """
        data = self.training_data if on_training_data else self.test_data
        with self.begin_optional_tracking_context_for_model(model, track=track) as trackingContext:
            if fit:
                self.fit_model(model)
            result: VectorModelEvaluationData = self._eval_model(model, data)
            is_multiple_pred_vars = len(result.predicted_var_names) > 1
            for pred_var_name in result.predicted_var_names:
                metrics = result.get_eval_stats(pred_var_name).metrics_dict()
                trackingContext.track_metrics(metrics, pred_var_name if is_multiple_pred_vars else None)
        return result

    @abstractmethod
    def _eval_model(self, model: VectorModelBase, data: InputOutputData) -> TEvalData:
        pass

    def _compute_metrics(self, model: VectorModel, on_training_data=False) -> Dict[str, float]:
        return self._compute_metrics_for_var_name(model, None, on_training_data=on_training_data)

    def _compute_metrics_for_var_name(self, model, predicted_var_name: Optional[str], on_training_data=False):
        self.fit_model(model)
        track = False  # avoid duplicate tracking (this function is only called by compute_metrics, which already tracks)
        eval_data: VectorModelEvaluationData = self.eval_model(model, on_training_data=on_training_data, track=track)
        return eval_data.get_eval_stats(predicted_var_name=predicted_var_name).metrics_dict()

    def create_metrics_dict_provider(self, predicted_var_name: Optional[str]) -> MetricsDictProvider:
        """
        Creates a metrics dictionary provider, e.g. for use in hyperparameter optimisation

        :param predicted_var_name: the name of the predicted variable for which to obtain evaluation metrics; may be None only
            if the model outputs a single predicted variable
        :return: a metrics dictionary provider instance for the given variable
        """
        return MetricsDictProviderFromFunction(functools.partial(self._compute_metrics_for_var_name, predicted_var_name=predicted_var_name))

    def fit_model(self, model: VectorModelFittableBase):
        """Fits the given model's parameters using this evaluator's training data"""
        if self.training_data is None:
            raise Exception(f"Cannot fit model with evaluator {self.__class__.__name__}: no training data provided")
        model.fit(self.training_data.inputs, self.training_data.outputs)
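

# Illustrative sketch (not part of the original module): using create_metrics_dict_provider in a simple
# model-selection loop. The evaluator and candidate models are hypothetical placeholders, and the metric
# key "RMSE" is an assumption that depends on the metrics configured for the evaluator.
def _example_metrics_provider_for_model_selection(evaluator: "VectorRegressionModelEvaluator",
        candidate_models: List[VectorRegressionModel]) -> VectorRegressionModel:
    provider = evaluator.create_metrics_dict_provider(predicted_var_name=None)  # single output variable assumed
    # each candidate is fitted on the training split and evaluated on the test split
    return min(candidate_models, key=lambda m: provider.compute_metrics(m)["RMSE"])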


class RegressionEvaluatorParams(EvaluatorParams):
    def __init__(self,
            data_splitter: DataSplitter = None,
            fractional_split_test_fraction: float = None,
            fractional_split_random_seed=42,
            fractional_split_shuffle=True,
            metrics: Sequence[RegressionMetric] = None,
            additional_metrics: Sequence[RegressionMetric] = None,
            output_data_frame_transformer: DataFrameTransformer = None):
        """
        :param data_splitter: [if test data must be obtained via split] a splitter to use in order to obtain the split; if None,
            must specify fractional_split_test_fraction for a fractional split (default)
        :param fractional_split_test_fraction: [if data_splitter is None and test data must be obtained via split] the fraction of the
            data to use for testing/evaluation
        :param fractional_split_random_seed: [if data_splitter is None and test data must be obtained via split] the random seed to use
            for the fractional split of the data
        :param fractional_split_shuffle: [if data_splitter is None and test data must be obtained via split] whether to randomly (based on
            fractional_split_random_seed) shuffle the dataset before splitting it
        :param metrics: regression metrics to apply; if None, default regression metrics are used
        :param additional_metrics: additional regression metrics to apply
        :param output_data_frame_transformer: a data frame transformer to apply to all output data frames (both model outputs and ground
            truth), such that evaluation metrics are computed on the transformed data frame
        """
        super().__init__(data_splitter,
            fractional_split_test_fraction=fractional_split_test_fraction,
            fractional_split_random_seed=fractional_split_random_seed,
            fractional_split_shuffle=fractional_split_shuffle)
        self.metrics = metrics
        self.additional_metrics = additional_metrics
        self.output_data_frame_transformer = output_data_frame_transformer

    @classmethod
    def from_dict_or_instance(cls,
            params: Optional[Union[Dict[str, Any], "RegressionEvaluatorParams"]]) -> "RegressionEvaluatorParams":
        if params is None:
            return RegressionEvaluatorParams()
        elif type(params) == dict:
            raise Exception("Old-style dictionary parametrisation is no longer supported")
        elif isinstance(params, cls):
            return params
        else:
            raise ValueError(f"Must provide a dictionary or a {cls} instance, got {params} of type {type(params)}")
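

# Illustrative sketch (not part of the original module): the two ways of configuring the train/test split
# via evaluator parameters. The split fraction, seed and shuffle values are arbitrary examples.
def _example_regression_evaluator_params() -> Tuple[RegressionEvaluatorParams, RegressionEvaluatorParams]:
    # default fractional split, constructed lazily by get_data_splitter() from fraction/seed/shuffle
    params_fractional = RegressionEvaluatorParams(fractional_split_test_fraction=0.2, fractional_split_random_seed=42)
    # explicitly provided splitter (here: 80% training data, shuffled)
    params_explicit_splitter = RegressionEvaluatorParams(data_splitter=DataSplitterFractional(0.8, shuffle=True, random_seed=42))
    return params_fractional, params_explicit_splitter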


class VectorRegressionModelEvaluatorParams(RegressionEvaluatorParams):
    @deprecated("Use RegressionEvaluatorParams instead")
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


class VectorRegressionModelEvaluator(VectorModelEvaluator[VectorRegressionModelEvaluationData]):
    def __init__(self, data: InputOutputData, test_data: InputOutputData = None,
            params: RegressionEvaluatorParams = None):
        """
        Constructs an evaluator with test and training data.

        :param data: the full data set, or, if `test_data` is given, the training data
        :param test_data: the data to use for testing/evaluation; if None, must specify appropriate parameters to define splitting
        :param params: the parameters
        """
        super().__init__(data=data, test_data=test_data, params=params)
        self.params = params

    def _eval_model(self, model: VectorRegressionModel, data: InputOutputData) -> VectorRegressionModelEvaluationData:
        if not model.is_regression_model():
            raise ValueError(f"Expected a regression model, got {model}")
        eval_stats_by_var_name = {}
        predictions, ground_truth = self._compute_outputs(model, data)
        for predictedVarName in predictions.columns:
            if predictedVarName in ground_truth.columns:
                y_true = ground_truth[predictedVarName]
            else:
                if len(predictions.columns) == 1 and len(ground_truth.columns) == 1:
                    log.warning(f"Model output column '{predictedVarName}' does not match ground truth column '{ground_truth.columns[0]}'; "
                        f"assuming that this is not a problem, since only a single column is available")
                    y_true = ground_truth.iloc[:, 0]
                else:
                    raise Exception(f"Model output column '{predictedVarName}' not found in ground truth columns {ground_truth.columns}")
            eval_stats = RegressionEvalStats(y_predicted=predictions[predictedVarName], y_true=y_true,
                metrics=self.params.metrics,
                additional_metrics=self.params.additional_metrics,
                model=model,
                io_data=data)
            eval_stats_by_var_name[predictedVarName] = eval_stats
        return VectorRegressionModelEvaluationData(eval_stats_by_var_name, data, model)

    def compute_test_data_outputs(self, model: VectorModelBase) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Applies the given model to the test data

        :param model: the model to apply
        :return: a pair (predictions, ground_truth)
        """
        return self._compute_outputs(model, self.test_data)

    def _compute_outputs(self, model: VectorModelBase, io_data: InputOutputData):
        """
        Applies the given model to the given data

        :param model: the model to apply
        :param io_data: the data set
        :return: a pair (predictions, ground_truth)
        """
        predictions = model.predict(io_data.inputs)
        ground_truth = io_data.outputs
        if self.params.output_data_frame_transformer:
            predictions = self.params.output_data_frame_transformer.apply(predictions)
            ground_truth = self.params.output_data_frame_transformer.apply(ground_truth)
        return predictions, ground_truth
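

# Illustrative sketch (not part of the original module): end-to-end regression evaluation. The model and
# the inputs/outputs DataFrames are hypothetical placeholders, and InputOutputData(inputs, outputs) is
# assumed to be the constructor form.
def _example_regression_evaluation(model: VectorRegressionModel, inputs: pd.DataFrame,
        outputs: pd.DataFrame) -> pd.DataFrame:
    io_data = InputOutputData(inputs, outputs)
    params = RegressionEvaluatorParams(fractional_split_test_fraction=0.2)
    evaluator = VectorRegressionModelEvaluator(io_data, params=params)
    eval_data = evaluator.eval_model(model, fit=True)  # fit on the training split, evaluate on the test split
    return eval_data.get_data_frame()  # one row of metrics per predicted variable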


class VectorClassificationModelEvaluationData(VectorModelEvaluationData[ClassificationEvalStats]):
    def get_misclassified_inputs_data_frame(self) -> pd.DataFrame:
        return self.input_data.iloc[self.get_eval_stats().get_misclassified_indices()]

    def get_misclassified_triples_pred_true_input(self) -> List[Tuple[Any, Any, pd.Series]]:
        """
        :return: a list containing a triple (predicted class, true class, input series) for each misclassified data point
        """
        eval_stats = self.get_eval_stats()
        indices = eval_stats.get_misclassified_indices()
        return [(eval_stats.y_predicted[i], eval_stats.y_true[i], self.input_data.iloc[i]) for i in indices]
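

# Illustrative sketch (not part of the original module): inspecting misclassifications from an evaluation
# result. The eval_data argument is assumed to come from a classification evaluator (see below).
def _example_inspect_misclassifications(eval_data: VectorClassificationModelEvaluationData) -> None:
    for predicted_class, true_class, input_series in eval_data.get_misclassified_triples_pred_true_input():
        log.info(f"Misclassified: predicted={predicted_class}, true={true_class}, input:\n{input_series}")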


class ClassificationEvaluatorParams(EvaluatorParams):
    def __init__(self, data_splitter: DataSplitter = None, fractional_split_test_fraction: float = None, fractional_split_random_seed=42,
            fractional_split_shuffle=True, additional_metrics: Sequence[ClassificationMetric] = None,
            compute_probabilities: bool = False, binary_positive_label: Optional[str] = GUESS):
        """
        :param data_splitter: [if test data must be obtained via split] a splitter to use in order to obtain the split; if None,
            must specify fractional_split_test_fraction for a fractional split (default)
        :param fractional_split_test_fraction: [if data_splitter is None and test data must be obtained via split] the fraction of the
            data to use for testing/evaluation
        :param fractional_split_random_seed: [if data_splitter is None and test data must be obtained via split] the random seed to use
            for the fractional split of the data
        :param fractional_split_shuffle: [if data_splitter is None and test data must be obtained via split] whether to randomly (based on
            fractional_split_random_seed) shuffle the dataset before splitting it
        :param additional_metrics: additional metrics to apply
        :param compute_probabilities: whether to compute class probabilities. Enabling this enables many downstream computations
            and visualisations (e.g. precision-recall plots), but requires the model to support probability computation.
        :param binary_positive_label: the positive class label for binary classification; if GUESS, try to detect it from the labels;
            if None, no detection (assume non-binary classification)
        """
        super().__init__(data_splitter,
            fractional_split_test_fraction=fractional_split_test_fraction,
            fractional_split_random_seed=fractional_split_random_seed,
            fractional_split_shuffle=fractional_split_shuffle)
        self.additionalMetrics = additional_metrics
        self.computeProbabilities = compute_probabilities
        self.binaryPositiveLabel = binary_positive_label

    @classmethod
    def from_dict_or_instance(cls,
            params: Optional[Union[Dict[str, Any], "ClassificationEvaluatorParams"]]) \
            -> "ClassificationEvaluatorParams":
        if params is None:
            return ClassificationEvaluatorParams()
        elif type(params) == dict:
            raise ValueError("Old-style dictionary parametrisation is no longer supported")
        elif isinstance(params, ClassificationEvaluatorParams):
            return params
        else:
            raise ValueError(f"Must provide a dictionary or an instance, got {params}")


class VectorClassificationModelEvaluatorParams(ClassificationEvaluatorParams):
    @deprecated("Use ClassificationEvaluatorParams instead")
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)


class VectorClassificationModelEvaluator(VectorModelEvaluator[VectorClassificationModelEvaluationData]):
    def __init__(self,
            data: InputOutputData,
            test_data: InputOutputData = None,
            params: ClassificationEvaluatorParams = None):
        """
        Constructs an evaluator with test and training data.

        :param data: the full data set, or, if `test_data` is given, the training data
        :param test_data: the data to use for testing/evaluation; if None, must specify appropriate parameters to define splitting
        :param params: the parameters
        """
        super().__init__(data=data, test_data=test_data, params=params)
        self.params = params

    def _eval_model(self, model: VectorClassificationModel, data: InputOutputData) -> VectorClassificationModelEvaluationData:
        if model.is_regression_model():
            raise ValueError(f"Expected a classification model, got {model}")
        predictions, predictions_proba, ground_truth = self._compute_outputs(model, data)
        eval_stats = ClassificationEvalStats(
            y_predicted_class_probabilities=predictions_proba,
            y_predicted=predictions,
            y_true=ground_truth,
            labels=model.get_class_labels(),
            additional_metrics=self.params.additionalMetrics,
            binary_positive_label=self.params.binaryPositiveLabel)
        predicted_var_name = model.get_predicted_variable_names()[0]
        return VectorClassificationModelEvaluationData({predicted_var_name: eval_stats}, data, model)

    def compute_test_data_outputs(self, model) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Applies the given model to the test data

        :param model: the model to apply
        :return: a triple (predictions, predicted class probability vectors, ground_truth) of DataFrames
        """
        return self._compute_outputs(model, self.test_data)

    def _compute_outputs(self, model, io_data: InputOutputData) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """
        Applies the given model to the given data

        :param model: the model to apply
        :param io_data: the data set
        :return: a triple (predictions, predicted class probability vectors, ground_truth) of DataFrames
        """
        if self.params.computeProbabilities:
            class_probabilities = model.predict_class_probabilities(io_data.inputs)
            predictions = model.convert_class_probabilities_to_predictions(class_probabilities)
        else:
            class_probabilities = None
            predictions = model.predict(io_data.inputs)
        ground_truth = io_data.outputs
        return predictions, class_probabilities, ground_truth
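

# Illustrative sketch (not part of the original module): classification evaluation with class probabilities
# enabled. The model and the inputs/outputs DataFrames are hypothetical placeholders, and
# InputOutputData(inputs, outputs) is assumed to be the constructor form.
def _example_classification_evaluation(model: VectorClassificationModel, inputs: pd.DataFrame,
        outputs: pd.DataFrame) -> Dict[str, float]:
    io_data = InputOutputData(inputs, outputs)
    params = ClassificationEvaluatorParams(fractional_split_test_fraction=0.3, compute_probabilities=True)
    evaluator = VectorClassificationModelEvaluator(io_data, params=params)
    eval_data = evaluator.eval_model(model, fit=True)
    return eval_data.get_eval_stats().metrics_dict()  # metrics for the single predicted variable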


class RuleBasedVectorClassificationModelEvaluator(VectorClassificationModelEvaluator):
    def __init__(self, data: InputOutputData):
        super().__init__(data, test_data=data)

    def eval_model(self, model: VectorModelBase, on_training_data=False, track=True,
            fit=False) -> VectorClassificationModelEvaluationData:
        """
        Evaluates the rule-based model. Since the training data and the test data coincide, fitting the model
        will fit the model's preprocessors on the full data set, and evaluating it will evaluate the model on the
        same data set.

        :param model: the model to evaluate
        :param on_training_data: has to be False here; setting it to True is not supported and will lead to an
            exception
        :param track: whether to track the evaluation metrics in case a tracked experiment was set on this object
        :return: the evaluation result
        """
        if on_training_data:
            raise Exception("Evaluating rule-based models on training data is not supported; in this evaluator, "
                "training and test data coincide.")
        return super().eval_model(model)


class RuleBasedVectorRegressionModelEvaluator(VectorRegressionModelEvaluator):
    def __init__(self, data: InputOutputData):
        super().__init__(data, test_data=data)

    def eval_model(self, model: Union[VectorModelBase, VectorModelFittableBase], on_training_data=False, track=True,
            fit=False) -> VectorRegressionModelEvaluationData:
        """
        Evaluates the rule-based model. Since the training data and the test data coincide, fitting the model
        will fit the model's preprocessors on the full data set, and evaluating it will evaluate the model on the
        same data set.

        :param model: the model to evaluate
        :param on_training_data: has to be False here; setting it to True is not supported and will lead to an
            exception
        :param track: whether to track the evaluation metrics in case a tracked experiment was set on this object
        :return: the evaluation result
        """
        if on_training_data:
            raise Exception("Evaluating rule-based models on training data is not supported; in this evaluator, "
                "training and test data coincide.")
        return super().eval_model(model)