Coverage for src/sensai/torch/torch_models/lstnet/lstnet_models.py: 28%

import collections
import logging
import re
from typing import Optional, Union

import pandas as pd
import torch

from .lstnet_modules import LSTNetwork
from ...torch_base import TorchVectorClassificationModel, VectorTorchModel, ClassificationOutputMode
from ...torch_data import TorchDataSetProviderFromDataUtil, TensorScalerIdentity, TensorScaler, DataUtil
from ...torch_enums import ActivationFunction
from ...torch_opt import NNOptimiserParams

log: logging.Logger = logging.getLogger(__name__)


class LSTNetworkVectorClassificationModel(TorchVectorClassificationModel):
    """
    Classification model for time series data using the LSTNetwork architecture.

    Since the model takes a time series as input, it requires input data frames to use a special naming of columns
    such that the data can be interpreted correctly:
    Each column name must start with an N-digit prefix indicating the time slice the data pertains to (for some fixed N);
    the remaining suffix indicates the name of the actual feature.
    For each N-digit prefix, we must have the same set of suffixes in the list of columns, i.e. we must have the same
    features for each time slice in the input time series.
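    For example, with N=2, the columns could be "00price", "00volume", "01price", "01volume", encoding a time series
    with two time slices, each containing the (purely illustrative) features "price" and "volume".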

    """
    def __init__(self, num_input_time_slices, input_dim_per_time_slice, num_classes: Optional[int] = None,
            num_convolutions: int = 100, num_cnn_time_slices: int = 6, hid_rnn: int = 100, skip: int = 0, hid_skip: int = 5,
            hw_window: int = 0, hw_combine: str = "plus", dropout=0.2, output_activation=ActivationFunction.LOG_SOFTMAX, cuda=True,
            nn_optimiser_params: Union[dict, NNOptimiserParams] = None):
        """
        :param num_input_time_slices: the number of input time slices
        :param input_dim_per_time_slice: the dimension of the input data per time slice
        :param num_classes: the number of classes considered by this classification problem; if None, determine from data
        :param num_convolutions: the number of separate convolutions to apply, i.e. the number of independent convolution matrices,
            a.k.a. "hidC"; if it is 0, the entire complex processing path is not applied
        :param num_cnn_time_slices: the number of time slices considered by each convolution (i.e. it is one of the dimensions of
            the matrix used for convolutions, the other dimension being input_dim_per_time_slice), a.k.a. "Ck"
        :param hid_rnn: the number of hidden output dimensions for the RNN stage
        :param skip: the number of time slices to skip for the skip-RNN; if it is 0, the skip-RNN is not used
        :param hid_skip: the number of output dimensions of each of the parallel skip-RNNs
        :param hw_window: the number of time slices from the end of the input time series to consider as input for the highway
            component; if it is 0, the highway component is not used
        :param hw_combine: {"plus", "product", "bilinear"} the function with which the highway component's output is combined with
            the complex path's output
        :param dropout: the dropout probability to use during training (dropouts are applied after every major step in the
            evaluation path)
        :param output_activation: the output activation function
        :param cuda: whether to use CUDA (GPU acceleration)
        :param nn_optimiser_params: parameters for NNOptimiser to use for training
        """
        self.num_input_time_slices = num_input_time_slices
        self.input_dim_per_time_slice = input_dim_per_time_slice
        self.num_convolutions = num_convolutions
        self.num_cnn_time_slices = num_cnn_time_slices
        self.hid_rnn = hid_rnn
        self.skip = skip
        self.hid_skip = hid_skip
        self.hw_window = hw_window
        self.hw_combine = hw_combine
        self.dropout = dropout
        self.cuda = cuda
        self.output_activation = output_activation
        self.num_classes = num_classes
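        # determine the classification output mode (e.g. log-probabilities vs. probabilities) from the output activation function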
        output_mode = ClassificationOutputMode.for_activation_fn(ActivationFunction.torch_function_from_any(output_activation))
        super().__init__(output_mode, self._create_lst_network_model, nn_optimiser_params=nn_optimiser_params)

    def _create_lst_network_model(self):
        return self._LSTNetworkModel(self)

    class _LSTNetworkModel(VectorTorchModel):
        def __init__(self, parent: "LSTNetworkVectorClassificationModel"):
            super().__init__(parent.cuda)
            self.parent = parent

        def create_torch_module_for_dims(self, input_dim, output_dim):
            p = self.parent
            # the model input is the flattened time series: one dimension per (time slice, feature) pair
            expected_input_dim = p.num_input_time_slices * p.input_dim_per_time_slice
            if expected_input_dim != input_dim:
                raise ValueError(f"Unexpected input size {input_dim}, expected {expected_input_dim}")

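            # determine the number of output dimensions per time slice, which corresponds to the number of classes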
            if p.num_classes is None:
                output_dim_per_time_slice = output_dim
            else:
                output_dim_per_time_slice = p.num_classes
                if p.num_classes != output_dim:
                    raise ValueError(f"Unexpected output dim {output_dim}, expected {p.num_classes}")
            return LSTNetwork(num_input_time_slices=p.num_input_time_slices,
                input_dim_per_time_slice=p.input_dim_per_time_slice,
                num_output_time_slices=1,
                output_dim_per_time_slice=output_dim_per_time_slice,
                num_convolutions=p.num_convolutions,
                num_cnn_time_slices=p.num_cnn_time_slices,
                hid_rnn=p.hid_rnn,
                hw_window=p.hw_window,
                hw_combine=p.hw_combine,
                dropout=p.dropout,
                output_activation=p.output_activation,
                skip=p.skip,
                hid_skip=p.hid_skip,
                mode=LSTNetwork.Mode.CLASSIFICATION)

    def _create_data_set_provider(self, inputs: pd.DataFrame, outputs: pd.DataFrame) -> TorchDataSetProviderFromDataUtil:
        if self.num_classes is None:
            self.num_classes = len(self._labels)
        elif self.num_classes != len(self._labels):
            raise ValueError(f"Output dimension {self.num_classes} per time slice was specified, while the training data "
                f"contains {len(self._labels)} classes")
        return TorchDataSetProviderFromDataUtil(self.DataUtil(inputs, outputs, self.num_classes), self.cuda)

    def _predict_outputs_for_input_data_frame(self, inputs: pd.DataFrame) -> torch.Tensor:
        log.info(f"Predicting outputs for {len(inputs)} inputs")
        result = super()._predict_outputs_for_input_data_frame(inputs)
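        # the network emits a single output time slice; drop that singleton dimension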
        return result.squeeze(2)

    def _compute_model_inputs(self, x: pd.DataFrame, y: pd.DataFrame = None, fit=False) -> pd.DataFrame:
        x = super()._compute_model_inputs(x, y=y, fit=fit)

        # sort the input data frame's columns by name, such that the time slices appear in the correct temporal order
        x = x[sorted(x.columns)]

        # check input column name format and consistency
        col_name_regex = re.compile(r"(\d+).+")
        cols_by_time_slice = collections.defaultdict(list)
        num_digits = None
        for col_name in x.columns:
            match = col_name_regex.fullmatch(col_name)
            if not match:
                raise ValueError(f"Column name '{col_name}' does not match the required format (N-digit prefix indicating the "
                    f"time slice order, followed by the feature name, for some fixed N); columns={list(x.columns)}")
            time_slice = match.group(1)
            if num_digits is None:
                num_digits = len(time_slice)
            elif num_digits != len(time_slice):
                raise ValueError(f"Inconsistent number of digits in column names: Got {num_digits} leading digits for one feature "
                    f"and {len(time_slice)} for another; columns={list(x.columns)}")
            cols_by_time_slice[time_slice].append(col_name[num_digits:])
        reference_cols = None
        for time_slice, cols in cols_by_time_slice.items():
            if reference_cols is None:
                reference_cols = cols
            elif reference_cols != cols:
                raise ValueError(f"Inconsistent features across time slices: Got suffixes {cols} for one time slice and "
                    f"{reference_cols} for another; columns={list(x.columns)}")

        return x

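    # note: the nested class deliberately reuses the name of the imported DataUtil base class; the base class
    # reference in parentheses resolves to the module-level import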
    class DataUtil(DataUtil):
        def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, num_classes):
            self.y_data = y_data
            self.x_data = x_data
            self.num_classes = num_classes
            self.scaler = TensorScalerIdentity()

        def input_dim(self):
            return len(self.x_data.columns)

        def model_output_dim(self) -> int:
            return self.num_classes

        def split_into_tensors(self, fractional_size_of_first_set):
            split_index = round(fractional_size_of_first_set * len(self.y_data))
            y1, x1 = self.get_input_output_pair(self.y_data[:split_index], self.x_data[:split_index])
            y2, x2 = self.get_input_output_pair(self.y_data[split_index:], self.x_data[split_index:])
            return (x1, y1), (x2, y2)

        def get_input_output_pair(self, output, input):
            # class labels must be long tensors, input features float tensors
            y = torch.tensor(output.values).long()
            x = torch.tensor(input.values).float()
            return y, x

        def get_output_tensor_scaler(self) -> TensorScaler:
            return self.scaler

        def get_input_tensor_scaler(self) -> TensorScaler:
            return self.scaler
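

# Usage sketch (illustrative only; the feature names and data below are assumptions, not part of the library):
#
#   import numpy as np
#   slices, features = 4, ["price", "volume"]
#   columns = [f"{t}{f}" for t in range(slices) for f in features]  # "0price", "0volume", ..., "3volume" (here N=1)
#   x = pd.DataFrame(np.random.randn(100, slices * len(features)), columns=columns)
#   y = pd.DataFrame({"class": np.random.randint(0, 2, 100)})
#   model = LSTNetworkVectorClassificationModel(num_input_time_slices=slices,
#       input_dim_per_time_slice=len(features), cuda=False)
#   model.fit(x, y)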