Coverage for src/sensai/torch/torch_models/lstnet/lstnet_models.py: 28%
102 statements
coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

import collections
import logging
import re
from typing import Optional, Union

import pandas as pd
import torch

from .lstnet_modules import LSTNetwork
from ...torch_base import TorchVectorClassificationModel, VectorTorchModel, ClassificationOutputMode
from ...torch_data import TorchDataSetProviderFromDataUtil, TensorScalerIdentity, TensorScaler, DataUtil
from ...torch_enums import ActivationFunction
from ...torch_opt import NNOptimiserParams

log: logging.Logger = logging.getLogger(__name__)


class LSTNetworkVectorClassificationModel(TorchVectorClassificationModel):
    """
    Classification model for time series data using the LSTNetwork architecture.

    Since the model takes a time series as input, it requires that input data frames use a special naming scheme for
    columns such that the data can be interpreted correctly:
    Each column name must start with an N-digit prefix indicating the time slice the data pertains to (for any fixed N);
    the remaining suffix indicates the name of the actual feature.
    For each N-digit prefix, the same set of suffixes must appear in the list of columns, i.e. the same
    features must be present for each time slice in the input time series.
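
    For example, with a 2-digit prefix (N=2) and features "temp" and "load" (feature names here are purely
    illustrative), a series of two time slices would use the columns ["00temp", "00load", "01temp", "01load"].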
28 """
    def __init__(self, num_input_time_slices, input_dim_per_time_slice, num_classes: Optional[int] = None,
            num_convolutions: int = 100, num_cnn_time_slices: int = 6, hid_rnn: int = 100, skip: int = 0, hid_skip: int = 5,
            hw_window: int = 0, hw_combine: str = "plus", dropout=0.2, output_activation=ActivationFunction.LOG_SOFTMAX, cuda=True,
            nn_optimiser_params: Union[dict, NNOptimiserParams] = None):
        """
        :param num_input_time_slices: the number of input time slices
        :param input_dim_per_time_slice: the dimension of the input data per time slice
        :param num_classes: the number of classes considered by this classification problem; if None, determine from data
        :param num_convolutions: the number of separate convolutions to apply, i.e. the number of independent convolution matrices,
            a.k.a. "hidC"; if it is 0, then the entire complex processing path is not applied.
        :param num_cnn_time_slices: the number of time slices considered by each convolution (i.e. it is one of the dimensions of
            the matrix used for convolutions, the other dimension being input_dim_per_time_slice), a.k.a. "Ck"
        :param hid_rnn: the number of hidden output dimensions for the RNN stage
        :param skip: the number of time slices to skip for the skip-RNN. If it is 0, the skip-RNN is not used.
        :param hid_skip: the number of output dimensions of each of the skip parallel RNNs
        :param hw_window: the number of time slices from the end of the input time series to consider as input for the highway
            component. If it is 0, the highway component is not used.
        :param hw_combine: {"plus", "product", "bilinear"} the function with which the highway component's output is combined with
            the complex path's output
        :param dropout: the dropout probability to use during training (dropouts are applied after every major step in the
            evaluation path)
        :param output_activation: the output activation function
        :param cuda: whether to use CUDA (GPU processing)
        :param nn_optimiser_params: parameters for NNOptimiser to use for training
        """
        self.num_input_time_slices = num_input_time_slices
        self.input_dim_per_time_slice = input_dim_per_time_slice
        self.num_convolutions = num_convolutions
        self.num_cnn_time_slices = num_cnn_time_slices
        self.hid_rnn = hid_rnn
        self.skip = skip
        self.hid_skip = hid_skip
        self.hw_window = hw_window
        self.hw_combine = hw_combine
        self.dropout = dropout
        self.cuda = cuda
        self.output_activation = output_activation
        self.num_classes = num_classes
        output_mode = ClassificationOutputMode.for_activation_fn(ActivationFunction.torch_function_from_any(output_activation))
        super().__init__(output_mode, self._create_lst_network_model, nn_optimiser_params=nn_optimiser_params)

    def _create_lst_network_model(self):
        return self._LSTNetworkModel(self)

    class _LSTNetworkModel(VectorTorchModel):
        def __init__(self, parent: "LSTNetworkVectorClassificationModel"):
            super().__init__(parent.cuda)
            self.parent = parent

        def create_torch_module_for_dims(self, input_dim, output_dim):
            p = self.parent
            expected_input_dim = p.num_input_time_slices * p.input_dim_per_time_slice
            if expected_input_dim != input_dim:
                raise ValueError(f"Unexpected input size {input_dim}, expected {expected_input_dim}")
            if p.num_classes is None:
                output_dim_per_time_slice = output_dim
            else:
                output_dim_per_time_slice = p.num_classes
                if p.num_classes != output_dim:
                    raise ValueError(f"Unexpected output dim {output_dim}, expected {p.num_classes}")
            return LSTNetwork(num_input_time_slices=p.num_input_time_slices,
                input_dim_per_time_slice=p.input_dim_per_time_slice,
                num_output_time_slices=1,
                output_dim_per_time_slice=output_dim_per_time_slice,
                num_convolutions=p.num_convolutions,
                num_cnn_time_slices=p.num_cnn_time_slices,
                hid_rnn=p.hid_rnn,
                hw_window=p.hw_window,
                hw_combine=p.hw_combine,
                dropout=p.dropout,
                output_activation=p.output_activation,
                skip=p.skip,
                hid_skip=p.hid_skip,
                mode=LSTNetwork.Mode.CLASSIFICATION)

    def _create_data_set_provider(self, inputs: pd.DataFrame, outputs: pd.DataFrame) -> TorchDataSetProviderFromDataUtil:
        if self.num_classes is None:
            self.num_classes = len(self._labels)
        elif self.num_classes != len(self._labels):
            raise ValueError(f"Output dimension {self.num_classes} per time slice was specified, while the training data contains "
                f"{len(self._labels)} classes")
        return TorchDataSetProviderFromDataUtil(self.DataUtil(inputs, outputs, self.num_classes), self.cuda)

    def _predict_outputs_for_input_data_frame(self, inputs: pd.DataFrame) -> torch.Tensor:
        log.info(f"Predicting outputs for {len(inputs)} inputs")
        result = super()._predict_outputs_for_input_data_frame(inputs)
        return result.squeeze(2)  # remove the singleton output time slice dimension (num_output_time_slices=1)

    def _compute_model_inputs(self, x: pd.DataFrame, y: pd.DataFrame = None, fit=False) -> pd.DataFrame:
        x = super()._compute_model_inputs(x, y=y, fit=fit)

        # sort input data frame columns by name
        x = x[sorted(x.columns)]

        # check input column name format and consistency
        col_name_regex = re.compile(r"(\d+).+")
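        # e.g. for a column "01temperature" (illustrative name), group(1) == "01" is the time slice prefix
        # and the remainder "temperature" is the feature name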
        cols_by_time_slice = collections.defaultdict(list)
        num_digits = None
        for col_name in x.columns:
            match = col_name_regex.fullmatch(col_name)
            if not match:
                raise ValueError(f"Column name '{col_name}' does not match the required format (N-digit prefix indicating the "
                    f"time slice, followed by the feature name, for any fixed N); columns={list(x.columns)}")
            time_slice = match.group(1)
            if num_digits is None:
                num_digits = len(time_slice)
            elif num_digits != len(time_slice):
                raise ValueError(f"Inconsistent number of digits in column names: Got {num_digits} leading digits for one feature "
                    f"and {len(time_slice)} for another; columns={list(x.columns)}")
            cols_by_time_slice[time_slice].append(col_name[num_digits:])
        reference_cols = None
        for time_slice, cols in cols_by_time_slice.items():
            if reference_cols is None:
                reference_cols = cols
            elif reference_cols != cols:
                raise ValueError(f"Inconsistent features across time slices: Got suffixes {cols} for one time slice and "
                    f"{reference_cols} for another; columns={list(x.columns)}")

        return x

    class DataUtil(DataUtil):  # extends the DataUtil interface imported above (which this inner class shadows by name)
        def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, num_classes):
            self.y_data = y_data
            self.x_data = x_data
            self.num_classes = num_classes
            self.scaler = TensorScalerIdentity()

        def input_dim(self):
            return len(self.x_data.columns)

        def model_output_dim(self) -> int:
            return self.num_classes

        def split_into_tensors(self, fractional_size_of_first_set):
            # split at the given fraction without shuffling, preserving the temporal order of the data
            split_index = round(fractional_size_of_first_set * len(self.y_data))
            y1, x1 = self.get_input_output_pair(self.y_data[:split_index], self.x_data[:split_index])
            y2, x2 = self.get_input_output_pair(self.y_data[split_index:], self.x_data[split_index:])
            return (x1, y1), (x2, y2)

        def get_input_output_pair(self, output, input):
            y = torch.tensor(output.values).long()
            x = torch.tensor(input.values).float()
            return y, x

        def get_output_tensor_scaler(self) -> TensorScaler:
            return self.scaler

        def get_input_tensor_scaler(self) -> TensorScaler:
            return self.scaler
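

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): builds a toy data set following the column naming
    # scheme documented in the class docstring and instantiates the model. The fit/predict calls are
    # left commented out; they assume sensAI's usual DataFrame-based VectorModel interface and that
    # the surrounding package is importable. All data and feature names below are made up.
    import numpy as np

    time_slices, features = 4, ["temp", "load"]
    columns = [f"{t}{f}" for t in range(time_slices) for f in features]  # "0temp", "0load", "1temp", ...
    x_demo = pd.DataFrame(np.random.randn(50, len(columns)), columns=columns)
    y_demo = pd.DataFrame({"class": np.random.randint(0, 3, 50)})

    model = LSTNetworkVectorClassificationModel(num_input_time_slices=time_slices,
        input_dim_per_time_slice=len(features), num_classes=3, cuda=False)
    # model.fit(x_demo, y_demo)
    # predictions = model.predict(x_demo)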