"""
This module contains methods and classes that facilitate evaluation of different types of models. The suggested
workflow for evaluation is to use these higher-level functionalities instead of instantiating
the evaluation classes directly.
"""
import functools
import logging
from abc import ABC, abstractmethod
from collections import defaultdict
from dataclasses import dataclass
from typing import Dict, Any, Union, Generic, TypeVar, Optional, Sequence, Callable, Set, Iterable, List, Iterator, Tuple
import matplotlib.figure
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from .crossval import VectorModelCrossValidationData, VectorRegressionModelCrossValidationData, \
VectorClassificationModelCrossValidationData, \
VectorClassificationModelCrossValidator, VectorRegressionModelCrossValidator, VectorModelCrossValidator, VectorModelCrossValidatorParams
from .eval_stats import RegressionEvalStatsCollection, ClassificationEvalStatsCollection, RegressionEvalStatsPlotErrorDistribution, \
RegressionEvalStatsPlotHeatmapGroundTruthPredictions, RegressionEvalStatsPlotScatterGroundTruthPredictions, \
ClassificationEvalStatsPlotConfusionMatrix, ClassificationEvalStatsPlotPrecisionRecall, RegressionEvalStatsPlot, \
ClassificationEvalStatsPlotProbabilityThresholdPrecisionRecall, ClassificationEvalStatsPlotProbabilityThresholdCounts, \
Metric
from .eval_stats.eval_stats_base import EvalStats, EvalStatsCollection, EvalStatsPlot
from .eval_stats.eval_stats_classification import ClassificationEvalStats
from .eval_stats.eval_stats_regression import RegressionEvalStats
from .evaluator import VectorModelEvaluator, VectorModelEvaluationData, VectorRegressionModelEvaluator, \
VectorRegressionModelEvaluationData, VectorClassificationModelEvaluator, VectorClassificationModelEvaluationData, \
RegressionEvaluatorParams, ClassificationEvaluatorParams
from ..data import InputOutputData
from ..feature_importance import AggregatedFeatureImportance, FeatureImportanceProvider, plot_feature_importance, FeatureImportance
from ..tracking import TrackedExperiment
from ..tracking.tracking_base import TrackingContext
from ..util.deprecation import deprecated
from ..util.io import ResultWriter
from ..util.string import pretty_string_repr
from ..vector_model import VectorClassificationModel, VectorRegressionModel, VectorModel, VectorModelBase
log = logging.getLogger(__name__)
TModel = TypeVar("TModel", bound=VectorModel)
TEvalStats = TypeVar("TEvalStats", bound=EvalStats)
TEvalStatsPlot = TypeVar("TEvalStatsPlot", bound=EvalStatsPlot)
TEvalStatsCollection = TypeVar("TEvalStatsCollection", bound=EvalStatsCollection)
TEvaluator = TypeVar("TEvaluator", bound=VectorModelEvaluator)
TCrossValidator = TypeVar("TCrossValidator", bound=VectorModelCrossValidator)
TEvalData = TypeVar("TEvalData", bound=VectorModelEvaluationData)
TCrossValData = TypeVar("TCrossValData", bound=VectorModelCrossValidationData)
def _is_regression(model: Optional[VectorModel], is_regression: Optional[bool]) -> bool:
if model is None and is_regression is None or (model is not None and is_regression is not None):
raise ValueError("One of the two parameters have to be passed: model or isRegression")
if is_regression is None:
model: VectorModel
return model.is_regression_model()
return is_regression
[docs]def create_vector_model_evaluator(data: InputOutputData, model: VectorModel = None,
is_regression: bool = None, params: Union[RegressionEvaluatorParams, ClassificationEvaluatorParams] = None,
test_data: Optional[InputOutputData] = None) \
-> Union[VectorRegressionModelEvaluator, VectorClassificationModelEvaluator]:
is_regression = _is_regression(model, is_regression)
if params is None:
if is_regression:
params = RegressionEvaluatorParams(fractional_split_test_fraction=0.2)
else:
params = ClassificationEvaluatorParams(fractional_split_test_fraction=0.2)
log.debug(f"No evaluator parameters specified, using default: {params}")
if is_regression:
return VectorRegressionModelEvaluator(data, test_data=test_data, params=params)
else:
return VectorClassificationModelEvaluator(data, test_data=test_data, params=params)
[docs]def create_vector_model_cross_validator(data: InputOutputData,
model: VectorModel = None,
is_regression: bool = None,
params: Union[VectorModelCrossValidatorParams, Dict[str, Any]] = None) \
-> Union[VectorClassificationModelCrossValidator, VectorRegressionModelCrossValidator]:
if params is None:
raise ValueError("params must not be None")
cons = VectorRegressionModelCrossValidator if _is_regression(model, is_regression) else VectorClassificationModelCrossValidator
return cons(data, params=params)
[docs]def create_evaluation_util(data: InputOutputData, model: VectorModel = None, is_regression: bool = None,
evaluator_params: Optional[Union[RegressionEvaluatorParams, ClassificationEvaluatorParams]] = None,
cross_validator_params: Optional[Dict[str, Any]] = None, test_io_data: Optional[InputOutputData] = None) \
-> Union["ClassificationModelEvaluation", "RegressionModelEvaluation"]:
if _is_regression(model, is_regression):
return RegressionModelEvaluation(data, evaluator_params=evaluator_params, cross_validator_params=cross_validator_params, test_io_data=test_io_data)
else:
return ClassificationModelEvaluation(data, evaluator_params=evaluator_params, cross_validator_params=cross_validator_params, test_io_data=test_io_data)
[docs]def eval_model_via_evaluator(model: TModel, io_data: InputOutputData, test_fraction=0.2,
plot_target_distribution=False, compute_probabilities=True, normalize_plots=True, random_seed=60) -> TEvalData:
"""
Evaluates the given model via a simple evaluation mechanism that uses a single split
:param model: the model to evaluate
:param io_data: data on which to evaluate
:param test_fraction: the fraction of the data to test on
:param plot_target_distribution: whether to plot the target values distribution in the entire dataset
:param compute_probabilities: only relevant if the model is a classifier
:param normalize_plots: whether to normalize plotted distributions such that the sum/integrate to 1
:param random_seed:
:return: the evaluation data
"""
if plot_target_distribution:
title = "Distribution of target values in entire dataset"
fig = plt.figure(title)
output_distribution_series = io_data.outputs.iloc[:, 0]
log.info(f"Description of target column in training set: \n{output_distribution_series.describe()}")
if not model.is_regression_model():
output_distribution_series = output_distribution_series.value_counts(normalize=normalize_plots)
ax = sns.barplot(output_distribution_series.index, output_distribution_series.values)
ax.set_ylabel("%")
else:
ax = sns.distplot(output_distribution_series)
ax.set_ylabel("Probability density")
ax.set_title(title)
ax.set_xlabel("target value")
fig.show()
if model.is_regression_model():
evaluator_params = RegressionEvaluatorParams(fractional_split_test_fraction=test_fraction,
fractional_split_random_seed=random_seed)
else:
evaluator_params = ClassificationEvaluatorParams(fractional_split_test_fraction=test_fraction,
compute_probabilities=compute_probabilities, fractional_split_random_seed=random_seed)
ev = create_evaluation_util(io_data, model=model, evaluator_params=evaluator_params)
return ev.perform_simple_evaluation(model, show_plots=True, log_results=True)
[docs]class EvaluationResultCollector:
def __init__(self, show_plots: bool = True, result_writer: Optional[ResultWriter] = None,
tracking_context: TrackingContext = None):
self.show_plots = show_plots
self.result_writer = result_writer
self.tracking_context = tracking_context
[docs] def is_plot_creation_enabled(self) -> bool:
return self.show_plots or self.result_writer is not None or self.tracking_context is not None
[docs] def add_data_frame_csv_file(self, name: str, df: pd.DataFrame):
if self.result_writer is not None:
self.result_writer.write_data_frame_csv_file(name, df)
[docs] def child(self, added_filename_prefix):
result_writer = self.result_writer
if result_writer:
result_writer = result_writer.child_with_added_prefix(added_filename_prefix)
return self.__class__(show_plots=self.show_plots, result_writer=result_writer)
[docs]class EvalStatsPlotCollector(Generic[TEvalStats, TEvalStatsPlot]):
def __init__(self):
self.plots: Dict[str, EvalStatsPlot] = {}
self.disabled_plots: Set[str] = set()
[docs] def add_plot(self, name: str, plot: EvalStatsPlot):
self.plots[name] = plot
[docs] def get_enabled_plots(self) -> List[str]:
return [p for p in self.plots if p not in self.disabled_plots]
[docs] def disable_plots(self, *names: str):
self.disabled_plots.update(names)
[docs] def create_plots(self, eval_stats: EvalStats, subtitle: str, result_collector: EvaluationResultCollector):
known_plots = set(self.plots.keys())
unknown_disabled_plots = self.disabled_plots.difference(known_plots)
if len(unknown_disabled_plots) > 0:
log.warning(f"Plots were disabled which are not registered: {unknown_disabled_plots}; known plots: {known_plots}")
for name, plot in self.plots.items():
if name not in self.disabled_plots and plot.is_applicable(eval_stats):
fig = plot.create_figure(eval_stats, subtitle)
if fig is not None:
result_collector.add_figure(name, fig)
[docs]class RegressionEvalStatsPlotCollector(EvalStatsPlotCollector[RegressionEvalStats, RegressionEvalStatsPlot]):
def __init__(self):
super().__init__()
self.add_plot("error-dist", RegressionEvalStatsPlotErrorDistribution())
self.add_plot("heatmap-gt-pred", RegressionEvalStatsPlotHeatmapGroundTruthPredictions(weighted=False))
self.add_plot("heatmap-gt-pred-weighted", RegressionEvalStatsPlotHeatmapGroundTruthPredictions(weighted=True))
self.add_plot("scatter-gt-pred", RegressionEvalStatsPlotScatterGroundTruthPredictions())
[docs]class ClassificationEvalStatsPlotCollector(EvalStatsPlotCollector[RegressionEvalStats, RegressionEvalStatsPlot]):
def __init__(self):
super().__init__()
self.add_plot("confusion-matrix-rel", ClassificationEvalStatsPlotConfusionMatrix(normalise=True))
self.add_plot("confusion-matrix-abs", ClassificationEvalStatsPlotConfusionMatrix(normalise=False))
# the plots below apply to the binary case only (skipped for non-binary case)
self.add_plot("precision-recall", ClassificationEvalStatsPlotPrecisionRecall())
self.add_plot("threshold-precision-recall", ClassificationEvalStatsPlotProbabilityThresholdPrecisionRecall())
self.add_plot("threshold-counts", ClassificationEvalStatsPlotProbabilityThresholdCounts())
[docs]class ModelEvaluation(ABC, Generic[TModel, TEvaluator, TEvalData, TCrossValidator, TCrossValData, TEvalStats]):
"""
Utility class for the evaluation of models based on a dataset
"""
def __init__(self, io_data: InputOutputData,
eval_stats_plot_collector: Union[RegressionEvalStatsPlotCollector, ClassificationEvalStatsPlotCollector],
evaluator_params: Optional[Union[RegressionEvaluatorParams, ClassificationEvaluatorParams,
Dict[str, Any]]] = None,
cross_validator_params: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None,
test_io_data: Optional[InputOutputData] = None):
"""
:param io_data: the data set to use for evaluation. For evaluation purposes, this dataset usually will be split
into training and test data according to the rules specified by `evaluator_params`.
However, if `test_io_data` is specified, then this is taken to be the training data and `test_io_data` is
taken to be the test data when creating evaluators for simple (single-split) evaluation.
:param eval_stats_plot_collector: a collector for plots generated from evaluation stats objects
:param evaluator_params: parameters with which to instantiate evaluators
:param cross_validator_params: parameters with which to instantiate cross-validators
:param test_io_data: optional test data (see `io_data`)
"""
if cross_validator_params is None:
cross_validator_params = VectorModelCrossValidatorParams(folds=5)
self.evaluator_params = evaluator_params
self.cross_validator_params = cross_validator_params
self.io_data = io_data
self.test_io_data = test_io_data
self.eval_stats_plot_collector = eval_stats_plot_collector
[docs] def create_evaluator(self, model: TModel = None, is_regression: bool = None) -> TEvaluator:
"""
Creates an evaluator holding the current input-output data
:param model: the model for which to create an evaluator (just for reading off regression or classification,
the resulting evaluator will work on other models as well)
:param is_regression: whether to create a regression model evaluator. Either this or model have to be specified
:return: an evaluator
"""
return create_vector_model_evaluator(self.io_data, model=model, is_regression=is_regression, test_data=self.test_io_data,
params=self.evaluator_params)
[docs] def create_cross_validator(self, model: TModel = None, is_regression: bool = None) -> TCrossValidator:
"""
Creates a cross-validator holding the current input-output data
:param model: the model for which to create a cross-validator (just for reading off regression or classification,
the resulting evaluator will work on other models as well)
:param is_regression: whether to create a regression model cross-validator. Either this or model have to be specified
:return: an evaluator
"""
return create_vector_model_cross_validator(self.io_data, model=model, is_regression=is_regression,
params=self.cross_validator_params)
@staticmethod
def _result_writer_for_model(result_writer: Optional[ResultWriter], model: TModel) -> Optional[ResultWriter]:
if result_writer is None:
return None
return result_writer.child_with_added_prefix(model.get_name() + "_")
[docs] def compare_models(self, models: Sequence[TModel], result_writer: Optional[ResultWriter] = None, use_cross_validation=False,
fit_models=True, write_individual_results=True, sort_column: Optional[str] = None, sort_ascending: bool = True,
sort_column_move_to_left=True,
also_include_unsorted_results: bool = False, also_include_cross_val_global_stats: bool = False,
visitors: Optional[Iterable["ModelComparisonVisitor"]] = None,
write_visitor_results=False, write_csv=False,
tracked_experiment: Optional[TrackedExperiment] = None) -> "ModelComparisonData":
"""
Compares several models via simple evaluation or cross-validation
:param models: the models to compare
:param result_writer: a writer with which to store results of the comparison
:param use_cross_validation: whether to use cross-validation in order to evaluate models; if False, use a simple evaluation
on test data (single split)
:param fit_models: whether to fit models before evaluating them; this can only be False if useCrossValidation=False
:param write_individual_results: whether to write results files on each individual model (in addition to the comparison
summary)
:param sort_column: column/metric name by which to sort; the fact that the column names change when using cross-validation
(aggregation function names being added) should be ignored, simply pass the (unmodified) metric name
:param sort_ascending: whether to sort using `sortColumn` in ascending order
:param sort_column_move_to_left: whether to move the `sortColumn` (if any) to the very left
:param also_include_unsorted_results: whether to also include, for the case where the results are sorted, the unsorted table of
results in the results text
:param also_include_cross_val_global_stats: whether to also include, when using cross-validation, the evaluation metrics obtained
when combining the predictions from all folds into a single collection. Note that for classification models,
this may not always be possible (if the set of classes know to the model differs across folds)
:param visitors: visitors which may process individual results
:param write_visitor_results: whether to collect results from visitors (if any) after the comparison
:param write_csv: whether to write metrics table to CSV files
:param tracked_experiment: an experiment for tracking
:return: the comparison results
"""
# collect model evaluation results
stats_list = []
result_by_model_name = {}
evaluator = None
cross_validator = None
for i, model in enumerate(models, start=1):
model_name = model.get_name()
log.info(f"Evaluating model {i}/{len(models)} named '{model_name}' ...")
if use_cross_validation:
if not fit_models:
raise ValueError("Cross-validation necessitates that models be trained several times; got fitModels=False")
if cross_validator is None:
cross_validator = self.create_cross_validator(model)
cross_val_data = self.perform_cross_validation(model, result_writer=result_writer if write_individual_results else None,
cross_validator=cross_validator, tracked_experiment=tracked_experiment)
model_result = ModelComparisonData.Result(cross_validation_data=cross_val_data)
result_by_model_name[model_name] = model_result
eval_stats_collection = cross_val_data.get_eval_stats_collection()
stats_dict = eval_stats_collection.agg_metrics_dict()
else:
if evaluator is None:
evaluator = self.create_evaluator(model)
eval_data = self.perform_simple_evaluation(model, result_writer=result_writer if write_individual_results else None,
fit_model=fit_models, evaluator=evaluator, tracked_experiment=tracked_experiment)
model_result = ModelComparisonData.Result(eval_data=eval_data)
result_by_model_name[model_name] = model_result
eval_stats = eval_data.get_eval_stats()
stats_dict = eval_stats.metrics_dict()
stats_dict["model_name"] = model_name
stats_list.append(stats_dict)
if visitors is not None:
for visitor in visitors:
visitor.visit(model_name, model_result)
results_df = pd.DataFrame(stats_list).set_index("model_name")
# compute results data frame with combined set of data points (for cross-validation only)
cross_val_combined_results_df = None
if use_cross_validation and also_include_cross_val_global_stats:
try:
rows = []
for model_name, result in result_by_model_name.items():
stats_dict = result.cross_validation_data.get_eval_stats_collection().get_global_stats().metrics_dict()
stats_dict["model_name"] = model_name
rows.append(stats_dict)
cross_val_combined_results_df = pd.DataFrame(rows).set_index("model_name")
except Exception as e:
log.error(f"Creation of global stats data frame from cross-validation folds failed: {e}")
def sorted_df(df, sort_col):
if sort_col is not None:
if sort_col not in df.columns:
alt_sort_col = f"mean[{sort_col}]"
if alt_sort_col in df.columns:
sort_col = alt_sort_col
else:
sort_col = None
log.warning(f"Requested sort column '{sort_col}' (or '{alt_sort_col}') not in list of columns {list(df.columns)}")
if sort_col is not None:
df = df.sort_values(sort_col, ascending=sort_ascending, inplace=False)
if sort_column_move_to_left:
df = df[[sort_col] + [c for c in df.columns if c != sort_col]]
return df
# write comparison results
title = "Model comparison results"
if use_cross_validation:
title += ", aggregated across folds"
sorted_results_df = sorted_df(results_df, sort_column)
str_results = f"{title}:\n{sorted_results_df.to_string()}"
if also_include_unsorted_results and sort_column is not None:
str_results += f"\n\n{title} (unsorted):\n{results_df.to_string()}"
sorted_cross_val_combined_results_df = None
if cross_val_combined_results_df is not None:
sorted_cross_val_combined_results_df = sorted_df(cross_val_combined_results_df, sort_column)
str_results += f"\n\nModel comparison results based on combined set of data points from all folds:\n" \
f"{sorted_cross_val_combined_results_df.to_string()}"
log.info(str_results)
if result_writer is not None:
suffix = "crossval" if use_cross_validation else "simple-eval"
str_results += "\n\n" + "\n\n".join([f"{model.get_name()} = {model.pprints()}" for model in models])
result_writer.write_text_file(f"model-comparison-results-{suffix}", str_results)
if write_csv:
result_writer.write_data_frame_csv_file(f"model-comparison-metrics-{suffix}", sorted_results_df)
if sorted_cross_val_combined_results_df is not None:
result_writer.write_data_frame_csv_file(f"model-comparison-metrics-{suffix}-combined",
sorted_cross_val_combined_results_df)
# write visitor results
if visitors is not None and write_visitor_results:
result_collector = EvaluationResultCollector(show_plots=False, result_writer=result_writer)
for visitor in visitors:
visitor.collect_results(result_collector)
return ModelComparisonData(results_df, result_by_model_name, evaluator=evaluator, cross_validator=cross_validator)
[docs] def compare_models_cross_validation(self, models: Sequence[TModel],
result_writer: Optional[ResultWriter] = None) -> "ModelComparisonData":
"""
Compares several models via cross-validation
:param models: the models to compare
:param result_writer: a writer with which to store results of the comparison
:return: the comparison results
"""
return self.compare_models(models, result_writer=result_writer, use_cross_validation=True)
[docs] def create_plots(self, data: Union[TEvalData, TCrossValData], show_plots=True, result_writer: Optional[ResultWriter] = None,
subtitle_prefix: str = "", tracking_context: Optional[TrackingContext] = None):
"""
Creates default plots that visualise the results in the given evaluation data
:param data: the evaluation data for which to create the default plots
:param show_plots: whether to show plots
:param result_writer: if not None, plots will be written using this writer
:param subtitle_prefix: a prefix to add to the subtitle (which itself is the model name)
:param tracking_context: the experiment tracking context
"""
result_collector = EvaluationResultCollector(show_plots=show_plots, result_writer=result_writer,
tracking_context=tracking_context)
if result_collector.is_plot_creation_enabled():
self._create_plots(data, result_collector, subtitle=subtitle_prefix + data.model_name)
def _create_plots(self, data: Union[TEvalData, TCrossValData], result_collector: EvaluationResultCollector, subtitle=None):
def create_plots(pred_var_name, res_collector, subt):
if isinstance(data, VectorModelCrossValidationData):
eval_stats = data.get_eval_stats_collection(predicted_var_name=pred_var_name).get_global_stats()
elif isinstance(data, VectorModelEvaluationData):
eval_stats = data.get_eval_stats(predicted_var_name=pred_var_name)
else:
raise ValueError(f"Unexpected argument: data={data}")
return self._create_eval_stats_plots(eval_stats, res_collector, subtitle=subt)
predicted_var_names = data.predicted_var_names
if len(predicted_var_names) == 1:
create_plots(predicted_var_names[0], result_collector, subtitle)
else:
for predictedVarName in predicted_var_names:
create_plots(predictedVarName, result_collector.child(predictedVarName + "-"), f"{predictedVarName}, {subtitle}")
def _create_eval_stats_plots(self, eval_stats: TEvalStats, result_collector: EvaluationResultCollector, subtitle=None):
"""
:param eval_stats: the evaluation results for which to create plots
:param result_collector: the collector to which all plots are to be passed
:param subtitle: the subtitle to use for generated plots (if any)
"""
self.eval_stats_plot_collector.create_plots(eval_stats, subtitle, result_collector)
[docs]class RegressionModelEvaluation(ModelEvaluation[VectorRegressionModel, VectorRegressionModelEvaluator, VectorRegressionModelEvaluationData,
VectorRegressionModelCrossValidator, VectorRegressionModelCrossValidationData, RegressionEvalStats]):
def __init__(self, io_data: InputOutputData,
evaluator_params: Optional[Union[RegressionEvaluatorParams, Dict[str, Any]]] = None,
cross_validator_params: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None,
test_io_data: Optional[InputOutputData] = None):
"""
:param io_data: the data set to use for evaluation. For evaluation purposes, this dataset usually will be split
into training and test data according to the rules specified by `evaluator_params`.
However, if `test_io_data` is specified, then this is taken to be the training data and `test_io_data` is
taken to be the test data when creating evaluators for simple (single-split) evaluation.
:param evaluator_params: parameters with which to instantiate evaluators
:param cross_validator_params: parameters with which to instantiate cross-validators
:param test_io_data: optional test data (see `io_data`)
"""
super().__init__(io_data, eval_stats_plot_collector=RegressionEvalStatsPlotCollector(), evaluator_params=evaluator_params,
cross_validator_params=cross_validator_params, test_io_data=test_io_data)
[docs]class ClassificationModelEvaluation(ModelEvaluation[VectorClassificationModel, VectorClassificationModelEvaluator,
VectorClassificationModelEvaluationData, VectorClassificationModelCrossValidator, VectorClassificationModelCrossValidationData,
ClassificationEvalStats]):
def __init__(self, io_data: InputOutputData,
evaluator_params: Optional[Union[ClassificationEvaluatorParams, Dict[str, Any]]] = None,
cross_validator_params: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None,
test_io_data: Optional[InputOutputData] = None):
"""
:param io_data: the data set to use for evaluation. For evaluation purposes, this dataset usually will be split
into training and test data according to the rules specified by `evaluator_params`.
However, if `test_io_data` is specified, then this is taken to be the training data and `test_io_data` is
taken to be the test data when creating evaluators for simple (single-split) evaluation.
:param evaluator_params: parameters with which to instantiate evaluators
:param cross_validator_params: parameters with which to instantiate cross-validators
:param test_io_data: optional test data (see `io_data`)
"""
super().__init__(io_data, eval_stats_plot_collector=ClassificationEvalStatsPlotCollector(), evaluator_params=evaluator_params,
cross_validator_params=cross_validator_params, test_io_data=test_io_data)
[docs]class MultiDataModelEvaluation:
def __init__(self, io_data_dict: Dict[str, InputOutputData], key_name: str = "dataset",
meta_data_dict: Optional[Dict[str, Dict[str, Any]]] = None,
evaluator_params: Optional[Union[RegressionEvaluatorParams, ClassificationEvaluatorParams, Dict[str, Any]]] = None,
cross_validator_params: Optional[Union[VectorModelCrossValidatorParams, Dict[str, Any]]] = None,
test_io_data_dict: Optional[Dict[str, Optional[InputOutputData]]] = None):
"""
:param io_data_dict: a dictionary mapping from names to the data sets with which to evaluate models.
For evaluation or cross-validation, these datasets will usually be split according to the rules
specified by `evaluator_params or `cross_validator_params`. An exception is the case where
explicit test data sets are specified by passing `test_io_data_dict`. Then, for these data
sets, the io_data will not be split for evaluation, but the test_io_data will be used instead.
:param key_name: a name for the key value used in inputOutputDataDict, which will be used as a column name in result data frames
:param meta_data_dict: a dictionary which maps from a name (same keys as in inputOutputDataDict) to a dictionary, which maps
from a column name to a value and which is to be used to extend the result data frames containing per-dataset results
:param evaluator_params: parameters to use for the instantiation of evaluators (relevant if useCrossValidation==False)
:param cross_validator_params: parameters to use for the instantiation of cross-validators (relevant if useCrossValidation==True)
:param test_io_data_dict: a dictionary mapping from names to the test data sets to use for evaluation or to None.
Entries with non-None values will be used for evaluation of the models that were trained on the respective io_data_dict.
If passed, the keys need to be a superset of io_data_dict's keys (note that the values may be None, e.g.
if you want to use test data sets for some entries, and splitting of the io_data for others).
If not None, cross-validation cannot be used when calling ``compare_models``.
"""
if test_io_data_dict is not None:
missing_keys = set(io_data_dict).difference(test_io_data_dict)
if len(missing_keys) > 0:
raise ValueError(
"If test_io_data_dict is passed, its keys must be a superset of the io_data_dict's keys."
f"However, found missing_keys: {missing_keys}")
self.io_data_dict = io_data_dict
self.test_io_data_dict = test_io_data_dict
self.key_name = key_name
self.evaluator_params = evaluator_params
self.cross_validator_params = cross_validator_params
if meta_data_dict is not None:
self.meta_df = pd.DataFrame(meta_data_dict.values(), index=meta_data_dict.keys())
else:
self.meta_df = None
[docs] def compare_models(self,
model_factories: Sequence[Callable[[], Union[VectorRegressionModel, VectorClassificationModel]]],
use_cross_validation=False,
result_writer: Optional[ResultWriter] = None,
write_per_dataset_results=False,
write_csvs=False,
column_name_for_model_ranking: str = None,
rank_max=True,
add_combined_eval_stats=False,
create_metric_distribution_plots=True,
create_combined_eval_stats_plots=False,
distribution_plots_cdf = True,
distribution_plots_cdf_complementary = False,
visitors: Optional[Iterable["ModelComparisonVisitor"]] = None) \
-> Union["RegressionMultiDataModelComparisonData", "ClassificationMultiDataModelComparisonData"]:
"""
:param model_factories: a sequence of factory functions for the creation of models to evaluate; every factory must result
in a model with a fixed model name (otherwise results cannot be correctly aggregated)
:param use_cross_validation: whether to use cross-validation (rather than a single split) for model evaluation.
This can only be used if the instance's ``test_io_data_dict`` is None.
:param result_writer: a writer with which to store results; if None, results are not stored
:param write_per_dataset_results: whether to use resultWriter (if not None) in order to generate detailed results for each
dataset in a subdirectory named according to the name of the dataset
:param write_csvs: whether to write metrics table to CSV files
:param column_name_for_model_ranking: column name to use for ranking models
:param rank_max: if true, use max for ranking, else min
:param add_combined_eval_stats: whether to also report, for each model, evaluation metrics on the combined set data points from
all EvalStats objects.
Note that for classification, this is only possible if all individual experiments use the same set of class labels.
:param create_metric_distribution_plots: whether to create, for each model, plots of the distribution of each metric across the
datasets (applies only if result_writer is not None)
:param create_combined_eval_stats_plots: whether to combine, for each type of model, the EvalStats objects from the individual
experiments into a single objects that holds all results and use it to create plots reflecting the overall result (applies only
if resultWriter is not None).
Note that for classification, this is only possible if all individual experiments use the same set of class labels.
:param distribution_plots_cdf: whether to create CDF plots for the metric distributions. Applies only if
create_metric_distribution_plots is True and result_writer is not None.
:param distribution_plots_cdf_complementary: whether to plot the complementary cdf instead of the regular cdf, provided that
distribution_plots_cdf is True.
:param visitors: visitors which may process individual results. Plots generated by visitors are created/collected at the end of the
comparison.
:return: an object containing the full comparison results
"""
if self.test_io_data_dict and use_cross_validation:
raise ValueError("Cannot use cross-validation when `test_io_data_dict` is specified")
all_results_df = pd.DataFrame()
eval_stats_by_model_name = defaultdict(list)
results_by_model_name: Dict[str, List[ModelComparisonData.Result]] = defaultdict(list)
is_regression = None
plot_collector: Optional[EvalStatsPlotCollector] = None
model_names = None
model_name_to_string_repr = None
for i, (key, inputOutputData) in enumerate(self.io_data_dict.items(), start=1):
log.info(f"Evaluating models for data set #{i}/{len(self.io_data_dict)}: {self.key_name}={key}")
models = [f() for f in model_factories]
current_model_names = [model.get_name() for model in models]
if model_names is None:
model_names = current_model_names
elif model_names != current_model_names:
log.warning(f"Model factories do not produce fixed names; use model.withName to name your models. "
f"Got {current_model_names}, previously got {model_names}")
if is_regression is None:
models_are_regression = [model.is_regression_model() for model in models]
if all(models_are_regression):
is_regression = True
elif not any(models_are_regression):
is_regression = False
else:
raise ValueError("The models have to be either all regression models or all classification, not a mixture")
test_io_data = self.test_io_data_dict[key] if self.test_io_data_dict is not None else None
ev = create_evaluation_util(inputOutputData, is_regression=is_regression, evaluator_params=self.evaluator_params,
cross_validator_params=self.cross_validator_params, test_io_data=test_io_data)
if plot_collector is None:
plot_collector = ev.eval_stats_plot_collector
# compute data frame with results for current data set
if write_per_dataset_results and result_writer is not None:
child_result_writer = result_writer.child_for_subdirectory(key)
else:
child_result_writer = None
comparison_data = ev.compare_models(models, use_cross_validation=use_cross_validation, result_writer=child_result_writer,
visitors=visitors, write_visitor_results=False)
df = comparison_data.results_df
# augment data frame
df[self.key_name] = key
df["model_name"] = df.index
df = df.reset_index(drop=True)
# collect eval stats objects by model name
for modelName, result in comparison_data.result_by_model_name.items():
if use_cross_validation:
eval_stats = result.cross_validation_data.get_eval_stats_collection().get_global_stats()
else:
eval_stats = result.eval_data.get_eval_stats()
eval_stats_by_model_name[modelName].append(eval_stats)
results_by_model_name[modelName].append(result)
all_results_df = pd.concat((all_results_df, df))
if model_name_to_string_repr is None:
model_name_to_string_repr = {model.get_name(): model.pprints() for model in models}
if self.meta_df is not None:
all_results_df = all_results_df.join(self.meta_df, on=self.key_name, how="left")
str_all_results = f"All results:\n{all_results_df.to_string()}"
log.info(str_all_results)
# create mean result by model, removing any metrics/columns that produced NaN values
# (because the mean would be computed without them, skipna parameter unsupported)
all_results_grouped = all_results_df.drop(columns=self.key_name).dropna(axis=1).groupby("model_name")
mean_results_df: pd.DataFrame = all_results_grouped.mean()
for colName in [column_name_for_model_ranking, f"mean[{column_name_for_model_ranking}]"]:
if colName in mean_results_df:
mean_results_df.sort_values(column_name_for_model_ranking, inplace=True, ascending=not rank_max)
break
str_mean_results = f"Mean results (averaged across {len(self.io_data_dict)} data sets):\n{mean_results_df.to_string()}"
log.info(str_mean_results)
def iter_combined_eval_stats_from_all_data_sets():
for model_name, evalStatsList in eval_stats_by_model_name.items():
if is_regression:
ev_stats = RegressionEvalStatsCollection(evalStatsList).get_global_stats()
else:
ev_stats = ClassificationEvalStatsCollection(evalStatsList).get_global_stats()
yield model_name, ev_stats
# create further aggregations
agg_dfs = []
for op_name, agg_fn in [("mean", lambda x: x.mean()), ("std", lambda x: x.std()), ("min", lambda x: x.min()),
("max", lambda x: x.max())]:
agg_df = agg_fn(all_results_grouped)
agg_df.columns = [f"{op_name}[{c}]" for c in agg_df.columns]
agg_dfs.append(agg_df)
further_aggs_df = pd.concat(agg_dfs, axis=1)
further_aggs_df = further_aggs_df.loc[mean_results_df.index] # apply same sort order (index is model_name)
column_order = functools.reduce(lambda a, b: a + b, [list(t) for t in zip(*[df.columns for df in agg_dfs])])
further_aggs_df = further_aggs_df[column_order]
str_further_aggs = f"Further aggregations:\n{further_aggs_df.to_string()}"
log.info(str_further_aggs)
# combined eval stats from all datasets (per model)
str_combined_eval_stats = ""
if add_combined_eval_stats:
rows = []
for modelName, eval_stats in iter_combined_eval_stats_from_all_data_sets():
rows.append({"model_name": modelName, **eval_stats.metrics_dict()})
combined_stats_df = pd.DataFrame(rows)
combined_stats_df.set_index("model_name", drop=True, inplace=True)
combined_stats_df = combined_stats_df.loc[mean_results_df.index] # apply same sort order (index is model_name)
str_combined_eval_stats = f"Results on combined test data from all data sets:\n{combined_stats_df.to_string()}\n\n"
log.info(str_combined_eval_stats)
if result_writer is not None:
comparison_content = str_mean_results + "\n\n" + str_further_aggs + "\n\n" + str_combined_eval_stats + str_all_results
comparison_content += "\n\nModels [example instance]:\n\n"
comparison_content += "\n\n".join(f"{name} = {s}" for name, s in model_name_to_string_repr.items())
result_writer.write_text_file("model-comparison-results", comparison_content)
if write_csvs:
result_writer.write_data_frame_csv_file("all-results", all_results_df)
result_writer.write_data_frame_csv_file("mean-results", mean_results_df)
# create plots from combined data for each model
if create_combined_eval_stats_plots:
for modelName, eval_stats in iter_combined_eval_stats_from_all_data_sets():
child_result_writer = result_writer.child_with_added_prefix(modelName + "_") if result_writer is not None else None
result_collector = EvaluationResultCollector(show_plots=False, result_writer=child_result_writer)
plot_collector.create_plots(eval_stats, subtitle=modelName, result_collector=result_collector)
# collect results from visitors (if any)
result_collector = EvaluationResultCollector(show_plots=False, result_writer=result_writer)
if visitors is not None:
for visitor in visitors:
visitor.collect_results(result_collector)
# create result
dataset_names = list(self.io_data_dict.keys())
if is_regression:
mdmc_data = RegressionMultiDataModelComparisonData(all_results_df, mean_results_df, further_aggs_df, eval_stats_by_model_name,
results_by_model_name, dataset_names, model_name_to_string_repr)
else:
mdmc_data = ClassificationMultiDataModelComparisonData(all_results_df, mean_results_df, further_aggs_df,
eval_stats_by_model_name, results_by_model_name, dataset_names, model_name_to_string_repr)
# plot distributions
if create_metric_distribution_plots and result_writer is not None:
mdmc_data.create_distribution_plots(result_writer, cdf=distribution_plots_cdf,
cdf_complementary=distribution_plots_cdf_complementary)
return mdmc_data
[docs]class ModelComparisonData:
[docs] @dataclass
class Result:
eval_data: Union[VectorClassificationModelEvaluationData, VectorRegressionModelEvaluationData] = None
cross_validation_data: Union[VectorClassificationModelCrossValidationData, VectorRegressionModelCrossValidationData] = None
[docs] def iter_evaluation_data(self) -> Iterator[Union[VectorClassificationModelEvaluationData, VectorRegressionModelEvaluationData]]:
if self.eval_data is not None:
yield self.eval_data
if self.cross_validation_data is not None:
yield from self.cross_validation_data.eval_data_list
def __init__(self, results_df: pd.DataFrame, results_by_model_name: Dict[str, Result], evaluator: Optional[VectorModelEvaluator] = None,
cross_validator: Optional[VectorModelCrossValidator] = None):
self.results_df = results_df
self.result_by_model_name = results_by_model_name
self.evaluator = evaluator
self.cross_validator = cross_validator
[docs] def get_best_model_name(self, metric_name: str) -> str:
idx = np.argmax(self.results_df[metric_name])
return self.results_df.index[idx]
[docs] def get_best_model(self, metric_name: str) -> Union[VectorClassificationModel, VectorRegressionModel, VectorModelBase]:
result = self.result_by_model_name[self.get_best_model_name(metric_name)]
if result.eval_data is None:
raise ValueError("The best model is not well-defined when using cross-validation")
return result.eval_data.model
[docs]class ModelComparisonVisitor(ABC):
[docs] @abstractmethod
def visit(self, model_name: str, result: ModelComparisonData.Result):
pass
[docs] @abstractmethod
def collect_results(self, result_collector: EvaluationResultCollector) -> None:
"""
Collects results (such as figures) at the end of the model comparison, based on the results collected
:param result_collector: the collector to which figures are to be added
"""
pass
[docs]class ModelComparisonVisitorAggregatedFeatureImportance(ModelComparisonVisitor):
"""
During a model comparison, computes aggregated feature importance values for the model with the given name
"""
def __init__(self, model_name: str, feature_agg_regex: Sequence[str] = (), write_figure=True, write_data_frame_csv=False):
r"""
:param model_name: the name of the model for which to compute the aggregated feature importance values
:param feature_agg_regex: a sequence of regular expressions describing which feature names to sum as one. Each regex must
contain exactly one group. If a regex matches a feature name, the feature importance will be summed under the key
of the matched group instead of the full feature name. For example, the regex r"(\w+)_\d+$" will cause "foo_1" and "foo_2"
to be summed under "foo" and similarly "bar_1" and "bar_2" to be summed under "bar".
"""
self.model_name = model_name
self.agg_feature_importance = AggregatedFeatureImportance(feature_agg_reg_ex=feature_agg_regex)
self.write_figure = write_figure
self.write_data_frame_csv = write_data_frame_csv
[docs] def visit(self, model_name: str, result: ModelComparisonData.Result):
if model_name == self.model_name:
if result.cross_validation_data is not None:
models = result.cross_validation_data.trained_models
if models is not None:
for model in models:
self._collect(model)
else:
raise ValueError("Models were not returned in cross-validation results")
elif result.eval_data is not None:
self._collect(result.eval_data.model)
def _collect(self, model: Union[FeatureImportanceProvider, VectorModelBase]):
if not isinstance(model, FeatureImportanceProvider):
raise ValueError(f"Got model which does inherit from {FeatureImportanceProvider.__qualname__}: {model}")
self.agg_feature_importance.add(model.get_feature_importance_dict())
[docs] @deprecated("Use getFeatureImportance and create the plot using the returned object")
def plot_feature_importance(self) -> plt.Figure:
feature_importance_dict = self.agg_feature_importance.get_aggregated_feature_importance().get_feature_importance_dict()
return plot_feature_importance(feature_importance_dict, subtitle=self.model_name)
[docs] def get_feature_importance(self) -> FeatureImportance:
return self.agg_feature_importance.get_aggregated_feature_importance()
[docs] def collect_results(self, result_collector: EvaluationResultCollector):
feature_importance = self.get_feature_importance()
if self.write_figure:
result_collector.add_figure(f"{self.model_name}_feature-importance", feature_importance.plot())
if self.write_data_frame_csv:
result_collector.add_data_frame_csv_file(f"{self.model_name}_feature-importance", feature_importance.get_data_frame())
[docs]class MultiDataModelComparisonData(Generic[TEvalStats, TEvalStatsCollection], ABC):
def __init__(self, all_results_df: pd.DataFrame,
mean_results_df: pd.DataFrame,
agg_results_df: pd.DataFrame,
eval_stats_by_model_name: Dict[str, List[TEvalStats]],
results_by_model_name: Dict[str, List[ModelComparisonData.Result]],
dataset_names: List[str],
model_name_to_string_repr: Dict[str, str]):
self.all_results_df = all_results_df
self.mean_results_df = mean_results_df
self.agg_results_df = agg_results_df
self.eval_stats_by_model_name = eval_stats_by_model_name
self.results_by_model_name = results_by_model_name
self.dataset_names = dataset_names
self.model_name_to_string_repr = model_name_to_string_repr
[docs] def get_model_names(self) -> List[str]:
return list(self.eval_stats_by_model_name.keys())
[docs] def get_model_description(self, model_name: str) -> str:
return self.model_name_to_string_repr[model_name]
[docs] def get_eval_stats_list(self, model_name: str) -> List[TEvalStats]:
return self.eval_stats_by_model_name[model_name]
[docs] @abstractmethod
def get_eval_stats_collection(self, model_name: str) -> TEvalStatsCollection:
pass
[docs] def iter_model_results(self, model_name: str) -> Iterator[Tuple[str, ModelComparisonData.Result]]:
results = self.results_by_model_name[model_name]
yield from zip(self.dataset_names, results)
[docs] def create_distribution_plots(self, result_writer: ResultWriter, cdf=True, cdf_complementary=False):
"""
Creates plots of distributions of metrics across datasets for each model as a histogram, and additionally
any x-y plots (scatter plots & heat maps) for metrics that have associated paired metrics that were also computed
:param result_writer: the result writer
:param cdf: whether to additionally plot, for each distribution, the cumulative distribution function
:param cdf_complementary: whether to plot the complementary cdf instead of the regular cdf, provided that ``cdf`` is True
"""
for modelName in self.get_model_names():
eval_stats_collection = self.get_eval_stats_collection(modelName)
for metricName in eval_stats_collection.get_metric_names():
# plot distribution
fig = eval_stats_collection.plot_distribution(metricName, subtitle=modelName, cdf=cdf, cdf_complementary=cdf_complementary)
result_writer.write_figure(f"{modelName}_dist-{metricName}", fig)
# scatter plot with paired metrics
metric: Metric = eval_stats_collection.get_metric_by_name(metricName)
for paired_metric in metric.get_paired_metrics():
if eval_stats_collection.has_metric(paired_metric):
fig = eval_stats_collection.plot_scatter(metric.name, paired_metric.name)
result_writer.write_figure(f"{modelName}_scatter-{metric.name}-{paired_metric.name}", fig)
fig = eval_stats_collection.plot_heat_map(metric.name, paired_metric.name)
result_writer.write_figure(f"{modelName}_heatmap-{metric.name}-{paired_metric.name}", fig)
[docs]class ClassificationMultiDataModelComparisonData(MultiDataModelComparisonData[ClassificationEvalStats, ClassificationEvalStatsCollection]):
[docs] def get_eval_stats_collection(self, model_name: str):
return ClassificationEvalStatsCollection(self.get_eval_stats_list(model_name))
[docs]class RegressionMultiDataModelComparisonData(MultiDataModelComparisonData[RegressionEvalStats, RegressionEvalStatsCollection]):
[docs] def get_eval_stats_collection(self, model_name: str):
return RegressionEvalStatsCollection(self.get_eval_stats_list(model_name))