Source code for sensai.evaluation.eval_stats.eval_stats_regression

import logging
from abc import abstractmethod, ABC
from typing import List, Sequence, Optional

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from .eval_stats_base import PredictionEvalStats, Metric, EvalStatsCollection, PredictionArray, EvalStatsPlot
from ...vector_model import VectorRegressionModel, InputOutputData
from ...util.plot import HistogramPlot

log = logging.getLogger(__name__)


class RegressionMetric(Metric["RegressionEvalStats"], ABC):
    def compute_value_for_eval_stats(self, eval_stats: "RegressionEvalStats", model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        return self.compute_value(np.array(eval_stats.y_true), np.array(eval_stats.y_predicted), model=model, io_data=io_data)

    @classmethod
    @abstractmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        pass

    @classmethod
    def compute_errors(cls, y_true: np.ndarray, y_predicted: np.ndarray):
        return y_predicted - y_true

    @classmethod
    def compute_abs_errors(cls, y_true: np.ndarray, y_predicted: np.ndarray):
        return np.abs(cls.compute_errors(y_true, y_predicted))
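
# A minimal sketch of a user-defined metric (hypothetical; not part of this module):
# a subclass only needs a unique `name` and a `compute_value` implementation.
#
#     class RegressionMetricMaxAE(RegressionMetric):
#         name = "MaxAE"
#
#         @classmethod
#         def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray,
#                 model: VectorRegressionModel = None, io_data: InputOutputData = None):
#             return np.max(cls.compute_abs_errors(y_true, y_predicted))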


class RegressionMetricMAE(RegressionMetric):
    name = "MAE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        return np.mean(cls.compute_abs_errors(y_true, y_predicted))


class RegressionMetricMSE(RegressionMetric):
    name = "MSE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        residuals = y_predicted - y_true
        return np.sum(residuals * residuals) / len(residuals)


class RegressionMetricRMSE(RegressionMetric):
    name = "RMSE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        errors = cls.compute_errors(y_true, y_predicted)
        return np.sqrt(np.mean(errors * errors))


class RegressionMetricRRSE(RegressionMetric):
    name = "RRSE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        mean_y = np.mean(y_true)
        residuals = y_predicted - y_true
        mean_deviation = y_true - mean_y
        return np.sqrt(np.sum(residuals * residuals) / np.sum(mean_deviation * mean_deviation))


class RegressionMetricR2(RegressionMetric):
    name = "R2"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        rrse = RegressionMetricRRSE.compute_value(y_true, y_predicted)
        return 1.0 - rrse * rrse


class RegressionMetricPCC(RegressionMetric):
    name = "PCC"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        cov = np.cov([y_true, y_predicted])
        return cov[0][1] / np.sqrt(cov[0][0] * cov[1][1])


class RegressionMetricStdDevAE(RegressionMetric):
    name = "StdDevAE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        return np.std(cls.compute_abs_errors(y_true, y_predicted))


class RegressionMetricMedianAE(RegressionMetric):
    name = "MedianAE"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        return np.median(cls.compute_abs_errors(y_true, y_predicted))
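
# Usage sketch (toy values): metric values can also be computed directly on arrays,
# without constructing a RegressionEvalStats object.
#
#     y_true = np.array([1.0, 2.0, 3.0])
#     y_pred = np.array([1.1, 1.9, 3.2])
#     RegressionMetricMAE.compute_value(y_true, y_pred)   # 0.1333...
#     RegressionMetricRMSE.compute_value(y_true, y_pred)  # 0.1414...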

DEFAULT_REGRESSION_METRICS = (RegressionMetricRRSE(), RegressionMetricR2(), RegressionMetricMAE(),
    RegressionMetricMSE(), RegressionMetricRMSE(), RegressionMetricStdDevAE())


class RegressionEvalStats(PredictionEvalStats["RegressionMetric"]):
    """
    Collects data for the evaluation of predicted continuous values and computes corresponding metrics
    """

    # class members controlling plot appearance, which can be centrally overridden by a user if necessary
    HEATMAP_COLORMAP_FACTORY = lambda self: LinearSegmentedColormap.from_list("whiteToRed",
        ((0, (1, 1, 1)), (1 / len(self.y_predicted), (1, 0.96, 0.96)), (1, (0.7, 0, 0))), len(self.y_predicted))
    HEATMAP_DIAGONAL_COLOR = "green"
    HEATMAP_ERROR_BOUNDARY_VALUE = None
    HEATMAP_ERROR_BOUNDARY_COLOR = (0.8, 0.8, 0.8)
    SCATTER_PLOT_POINT_COLOR = (0, 0, 1, 0.05)

    def __init__(self, y_predicted: Optional[PredictionArray] = None, y_true: Optional[PredictionArray] = None,
            metrics: Optional[Sequence["RegressionMetric"]] = None, additional_metrics: Sequence["RegressionMetric"] = None,
            model: VectorRegressionModel = None, io_data: InputOutputData = None):
        """
        :param y_predicted: the predicted values
        :param y_true: the true values
        :param metrics: the metrics to compute for evaluation; if None, will use DEFAULT_REGRESSION_METRICS
        :param additional_metrics: the metrics to additionally compute
        """
        self.model = model
        self.ioData = io_data

        if metrics is None:
            metrics = DEFAULT_REGRESSION_METRICS
        metrics = list(metrics)

        super().__init__(y_predicted, y_true, metrics, additional_metrics=additional_metrics)
    def compute_metric_value(self, metric: RegressionMetric) -> float:
        return metric.compute_value_for_eval_stats(self, model=self.model, io_data=self.ioData)

    def compute_mse(self):
        """Computes the mean squared error (MSE)"""
        return self.compute_metric_value(RegressionMetricMSE())

    def compute_rrse(self):
        """Computes the root relative squared error (RRSE)"""
        return self.compute_metric_value(RegressionMetricRRSE())

    def compute_pcc(self):
        """Gets the Pearson correlation coefficient (PCC)"""
        return self.compute_metric_value(RegressionMetricPCC())

    def compute_r2(self):
        """Gets the R^2 score"""
        return self.compute_metric_value(RegressionMetricR2())

    def compute_mae(self):
        """Gets the mean absolute error (MAE)"""
        return self.compute_metric_value(RegressionMetricMAE())

    def compute_rmse(self):
        """Gets the root mean squared error (RMSE)"""
        return self.compute_metric_value(RegressionMetricRMSE())

    def compute_std_dev_ae(self):
        """Gets the standard deviation of the absolute error"""
        return self.compute_metric_value(RegressionMetricStdDevAE())
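
    # Usage sketch (values are hypothetical):
    #
    #     stats = RegressionEvalStats(y_predicted=[1.1, 1.9, 3.2], y_true=[1.0, 2.0, 3.0])
    #     stats.compute_rmse()  # root mean squared error
    #     stats.compute_r2()    # R^2 score, equal to 1 - RRSE^2 (cf. RegressionMetricR2)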

    def create_eval_stats_collection(self) -> "RegressionEvalStatsCollection":
        """
        For the case where we collected data on multiple dimensions, obtain a stats collection where each
        object in the collection holds stats on just one dimension
        """
        if self.y_true_multidim is None:
            raise Exception("No multi-dimensional data was collected")
        dim = len(self.y_true_multidim)
        stats_list = []
        for i in range(dim):
            stats = RegressionEvalStats(self.y_predicted_multidim[i], self.y_true_multidim[i])
            stats_list.append(stats)
        return RegressionEvalStatsCollection(stats_list)
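
    # Sketch for multi-dimensional targets (illustrative):
    #
    #     collection = stats.create_eval_stats_collection()  # one RegressionEvalStats per output dimension
    #     collection.get_combined_eval_stats()               # stats across all dimensions combined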

    def plot_error_distribution(self, bins="auto", title_add=None) -> Optional[plt.Figure]:
        """
        :param bins: bin specification (see :class:`HistogramPlot`)
        :param title_add: a string to add to the title (on a second line)
        :return: the resulting figure object or None
        """
        errors = np.array(self.y_predicted) - np.array(self.y_true)
        title = "Prediction Error Distribution"
        if title_add is not None:
            title += "\n" + title_add
        if bins == "auto" and len(errors) < 100:
            bins = 10  # seaborn can crash with low number of data points and bins="auto" (tries to allocate vast amounts of memory)
        plot = HistogramPlot(errors, bins=bins, kde=True)
        plot.title(title)
        plot.xlabel("error (prediction - ground truth)")
        plot.ylabel("probability density")
        return plot.fig

    def plot_scatter_ground_truth_predictions(self, figure=True, title_add=None, **kwargs) -> Optional[plt.Figure]:
        """
        :param figure: whether to plot in a separate figure and return that figure
        :param title_add: a string to be added to the title in a second line
        :param kwargs: parameters to be passed on to plt.scatter()
        :return: the resulting figure object or None
        """
        fig = None
        title = "Scatter Plot of Predicted Values vs. Ground Truth"
        if title_add is not None:
            title += "\n" + title_add
        if figure:
            fig = plt.figure(title.replace("\n", " "))
        y_range = [min(self.y_true), max(self.y_true)]
        plt.scatter(self.y_true, self.y_predicted, c=[self.SCATTER_PLOT_POINT_COLOR], zorder=2, **kwargs)
        plt.plot(y_range, y_range, '-', lw=1, label="_not in legend", color="green", zorder=1)
        plt.xlabel("ground truth")
        plt.ylabel("prediction")
        plt.title(title)
        return fig

    def plot_heatmap_ground_truth_predictions(self, figure=True, cmap=None, bins=60, title_add=None,
            error_boundary: Optional[float] = None, **kwargs) -> Optional[plt.Figure]:
        """
        :param figure: whether to plot in a separate figure and return that figure
        :param cmap: the colour map to use (see corresponding parameter of plt.imshow for further information); if None, use factory
            defined in HEATMAP_COLORMAP_FACTORY (which can be centrally set to achieve custom behaviour throughout an application)
        :param bins: how many bins to use for constructing the heatmap
        :param title_add: a string to add to the title (on a second line)
        :param error_boundary: if not None, add two lines (above and below the diagonal) indicating this absolute regression error
            boundary; if None (default), use static member HEATMAP_ERROR_BOUNDARY_VALUE (which is also None by default, but can be
            centrally set to achieve custom behaviour throughout an application)
        :param kwargs: will be passed to plt.imshow()
        :return: the resulting figure object or None
        """
        fig = None
        title = "Heat Map of Predicted Values vs. Ground Truth"
        if title_add:
            title += "\n" + title_add
        if figure:
            fig = plt.figure(title.replace("\n", " "))
        y_range = [min(min(self.y_true), min(self.y_predicted)), max(max(self.y_true), max(self.y_predicted))]

        # diagonal
        plt.plot(y_range, y_range, '-', lw=0.75, label="_not in legend", color=self.HEATMAP_DIAGONAL_COLOR, zorder=2)

        # error boundaries
        if error_boundary is None:
            error_boundary = self.HEATMAP_ERROR_BOUNDARY_VALUE
        if error_boundary is not None:
            d = np.array(y_range)
            offs = np.array([error_boundary, error_boundary])
            plt.plot(d, d + offs, '-', lw=0.75, label="_not in legend", color=self.HEATMAP_ERROR_BOUNDARY_COLOR, zorder=2)
            plt.plot(d, d - offs, '-', lw=0.75, label="_not in legend", color=self.HEATMAP_ERROR_BOUNDARY_COLOR, zorder=2)

        # heat map
        heatmap, _, _ = np.histogram2d(self.y_true, self.y_predicted, range=[y_range, y_range], bins=bins, density=False)
        extent = [y_range[0], y_range[1], y_range[0], y_range[1]]
        if cmap is None:
            cmap = self.HEATMAP_COLORMAP_FACTORY()
        plt.imshow(heatmap.T, extent=extent, origin='lower', interpolation="none", cmap=cmap, zorder=1, **kwargs)

        plt.xlabel("ground truth")
        plt.ylabel("prediction")
        plt.title(title)
        return fig
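
# Plotting sketch (illustrative): each plot_* method returns a matplotlib Figure
# (or None if figure=False), which can be saved or shown directly, e.g.
#
#     fig = stats.plot_heatmap_ground_truth_predictions(bins=100)
#     fig.savefig("heatmap.png")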


class RegressionEvalStatsCollection(EvalStatsCollection[RegressionEvalStats, RegressionMetric]):
    def __init__(self, eval_stats_list: List[RegressionEvalStats]):
        super().__init__(eval_stats_list)
        self.globalStats = None

    def get_combined_eval_stats(self) -> RegressionEvalStats:
        if self.globalStats is None:
            y_true = np.concatenate([evalStats.y_true for evalStats in self.statsList])
            y_predicted = np.concatenate([evalStats.y_predicted for evalStats in self.statsList])
            es0 = self.statsList[0]
            self.globalStats = RegressionEvalStats(y_predicted, y_true, metrics=es0.metrics)
        return self.globalStats
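
    # Note: the combined stats are computed lazily on first access and cached in
    # self.globalStats, so repeated calls do not re-concatenate the underlying arrays.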


class RegressionEvalStatsPlot(EvalStatsPlot[RegressionEvalStats], ABC):
    pass


class RegressionEvalStatsPlotErrorDistribution(RegressionEvalStatsPlot):
    def create_figure(self, eval_stats: RegressionEvalStats, subtitle: str) -> plt.Figure:
        return eval_stats.plot_error_distribution(title_add=subtitle)


class RegressionEvalStatsPlotHeatmapGroundTruthPredictions(RegressionEvalStatsPlot):
    def create_figure(self, eval_stats: RegressionEvalStats, subtitle: str) -> plt.Figure:
        return eval_stats.plot_heatmap_ground_truth_predictions(title_add=subtitle)


class RegressionEvalStatsPlotScatterGroundTruthPredictions(RegressionEvalStatsPlot):
    def create_figure(self, eval_stats: RegressionEvalStats, subtitle: str) -> plt.Figure:
        return eval_stats.plot_scatter_ground_truth_predictions(title_add=subtitle)
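

# A minimal end-to-end sketch on synthetic data (illustrative only; not part of the library):
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    y_true = rng.normal(size=500)
    y_predicted = y_true + rng.normal(scale=0.3, size=500)  # simulate a reasonably accurate model
    stats = RegressionEvalStats(y_predicted=y_predicted, y_true=y_true)
    print(f"R2={stats.compute_r2():.3f}, RMSE={stats.compute_rmse():.3f}, MAE={stats.compute_mae():.3f}")
    stats.plot_scatter_ground_truth_predictions()
    plt.show()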