Coverage for src/sensai/sklearn_quantile.py: 0%

69 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import functools 

2from abc import ABC 

3 

4import numpy as np 

5import pandas as pd 

6from sklearn_quantile import RandomForestQuantileRegressor 

7 

8from .vector_model import VectorRegressionModel, InputOutputData 

9from .evaluation.eval_stats import RegressionMetric 

10 

11from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel 

12from .util.aggregation import RelativeFrequencyCounter 

13 

14 

class RandomForestQuantileRegressorVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel):
    """
    Vector regression model based on RandomForestQuantileRegressor, which is configured with three quantiles:
    0.5 and the lower/upper quantiles delimiting the central interval for the given confidence level.
    The first quantile's predictions serve as the regular point predictions, the remaining two as the
    confidence interval bounds.
    """
    def __init__(self, confidence: float, random_state=42, **kwargs):
        """
        :param confidence: the confidence level from which the interval quantiles are derived: with ``margin = 1 - confidence``,
            the model is configured with the quantiles ``[0.5, margin/2, 1 - margin/2]``
        :param random_state: the random state to pass on to RandomForestQuantileRegressor
        :param kwargs: keyword arguments to pass on to RandomForestQuantileRegressor
        """
        half_margin = (1-confidence)/2
        quantiles = [0.5, half_margin, 1 - half_margin]
        super().__init__(RandomForestQuantileRegressor, q=quantiles, random_state=random_state, **kwargs)

    def predict_confidence_intervals(self, x: pd.DataFrame, var_name: str = None):
        """
        :param x: the input data
        :param var_name: the predicted variable name; may be None if there is only one predicted variable
        :return: an array of shape [2, N], where the first dimension contains the confidence interval's lower bounds and the second
            its upper bounds
        """
        sklearn_model: RandomForestQuantileRegressor = self.get_sklearn_model(var_name)
        quantile_predictions = self._predict_quantiles(sklearn_model, self.compute_model_inputs(x))
        # drop the leading entry (quantile 0.5), keeping only the interval bounds
        return quantile_predictions[1:]

    def _predict_quantiles(self, model: RandomForestQuantileRegressor, inputs: pd.DataFrame) -> np.ndarray:
        """
        :param model: the fitted quantile regressor
        :param inputs: the model inputs
        :return: the result of the model's ``predict`` method for the given inputs
        """
        return model.predict(inputs)

    def _predict_sklearn_single_model(self, model, inputs: pd.DataFrame) -> np.ndarray:
        """
        :param model: the fitted quantile regressor
        :param inputs: the model inputs
        :return: the first entry of the quantile predictions (corresponding to the first configured quantile, 0.5)
        """
        all_quantile_predictions = self._predict_quantiles(model, inputs)
        return all_quantile_predictions[0]

43 

44 

class QuantileRegressionMetric(RegressionMetric, ABC):
    """
    Base class for regression metrics which are computed from the confidence intervals predicted by a
    RandomForestQuantileRegressorVectorRegressionModel.
    """
    @staticmethod
    @functools.lru_cache(maxsize=1)  # use cache for efficient reuse of results across different subclasses during evaluation
    def compute_confidence_intervals(model: VectorRegressionModel, io_data: InputOutputData = None) -> np.ndarray:
        """
        Computes the confidence intervals predicted by the given model for the inputs of the given data.

        The result is cached for the most recent (model, io_data) pair. Because of the cache, both arguments
        must be hashable, and the most recently used pair remains referenced by the cache until it is replaced.

        :param model: the model; must be a RandomForestQuantileRegressorVectorRegressionModel
        :param io_data: the data whose inputs are to be used for the prediction; must not be None
        :return: the array returned by the model's ``predict_confidence_intervals`` method
        :raises ValueError: if the model is not of the required type or no io_data is given
        """
        if not isinstance(model, RandomForestQuantileRegressorVectorRegressionModel):
            raise ValueError(f"Model must be of type RandomForestQuantileRegressorVectorRegressionModel, got type {type(model)}")
        # the None default exists only for signature compatibility; fail with a clear message instead of an AttributeError
        if io_data is None:
            raise ValueError("io_data must be provided in order to compute confidence intervals")
        intervals: np.ndarray = model.predict_confidence_intervals(io_data.inputs)
        return intervals

53 

54 

class QuantileRegressionMetricAccuracyInConfidenceInterval(QuantileRegressionMetric):
    """
    Metric reflecting the accuracy of the confidence interval, i.e. the relative frequency of predictions where the confidence interval
    contains the ground truth value
    """
    name = "AccuracyInCI"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None,
            io_data: InputOutputData = None):
        """
        :param y_true: the ground truth values
        :param y_predicted: the predicted values (not used by this metric)
        :param model: the model from which the confidence intervals are obtained
        :param io_data: the data for which the confidence intervals are computed
        :return: the relative frequency of ground truth values lying within the (inclusive) bounds of their confidence interval
        """
        bounds_per_point = cls.compute_confidence_intervals(model, io_data).transpose()
        hit_counter = RelativeFrequencyCounter()
        for bounds, actual in zip(bounds_per_point, y_true):
            lower, upper = bounds
            hit_counter.count(lower <= actual <= upper)
        return hit_counter.get_relative_frequency()

70 

71 

class QuantileRegressionMetricConfidenceIntervalMeanSize(QuantileRegressionMetric):
    """
    Metric for the mean size of the confidence interval
    """
    name = "MeanSizeCI"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None, io_data: InputOutputData = None):
        """
        :param y_true: the ground truth values (not used by this metric)
        :param y_predicted: the predicted values (not used by this metric)
        :param model: the model from which the confidence intervals are obtained
        :param io_data: the data for which the confidence intervals are computed
        :return: the mean of the interval sizes (upper bound minus lower bound)
        """
        bounds_per_point = cls.compute_confidence_intervals(model, io_data).transpose()
        interval_sizes = [upper-lower for lower, upper in bounds_per_point]
        return np.mean(interval_sizes)

85 

86 

class QuantileRegressionMetricConfidenceIntervalMedianSize(QuantileRegressionMetric):
    """
    Metric for the median size of the confidence interval
    """
    name = "MedianSizeCI"

    @classmethod
    def compute_value(cls, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None, io_data: InputOutputData = None):
        """
        :param y_true: the ground truth values (not used by this metric)
        :param y_predicted: the predicted values (not used by this metric)
        :param model: the model from which the confidence intervals are obtained
        :param io_data: the data for which the confidence intervals are computed
        :return: the median of the interval sizes (upper bound minus lower bound)
        """
        bounds_per_point = cls.compute_confidence_intervals(model, io_data).transpose()
        interval_sizes = [upper-lower for lower, upper in bounds_per_point]
        return np.median(interval_sizes)

100 

101 

class QuantileRegressionMetricRelFreqMaxSizeConfidenceInterval(QuantileRegressionMetric):
    """
    Relative frequency of confidence interval having the given maximum size
    """
    def __init__(self, max_size: float):
        """
        :param max_size: the maximum size (upper bound minus lower bound, inclusive) up to which a confidence interval is counted
        """
        metric_name = f"RelFreqMaxSizeCI[{max_size}]"
        super().__init__(metric_name)
        self.max_size = max_size

    def compute_value(self, y_true: np.ndarray, y_predicted: np.ndarray, model: VectorRegressionModel = None, io_data: InputOutputData = None):
        """
        :param y_true: the ground truth values (not used by this metric)
        :param y_predicted: the predicted values (not used by this metric)
        :param model: the model from which the confidence intervals are obtained
        :param io_data: the data for which the confidence intervals are computed
        :return: the relative frequency of confidence intervals whose size does not exceed ``max_size``
        """
        bounds_per_point = self.compute_confidence_intervals(model, io_data).transpose()
        within_limit_counter = RelativeFrequencyCounter()
        for bounds in bounds_per_point:
            lower, upper = bounds
            within_limit_counter.count(upper-lower <= self.max_size)
        return within_limit_counter.get_relative_frequency()

116 return counter.get_relative_frequency()