Coverage for src/sensai/torch/torch_models/lstnet/lstnet_models.py: 28%
102 statements
coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

import collections
import logging
import re
from typing import Optional, Union

import pandas as pd
import torch

from .lstnet_modules import LSTNetwork
from ...torch_base import TorchVectorClassificationModel, VectorTorchModel, ClassificationOutputMode
from ...torch_data import TorchDataSetProviderFromDataUtil, TensorScalerIdentity, TensorScaler, DataUtil
from ...torch_enums import ActivationFunction
from ...torch_opt import NNOptimiserParams

log: logging.Logger = logging.getLogger(__name__)


class LSTNetworkVectorClassificationModel(TorchVectorClassificationModel):
    """
    Classification model for time series data using the LSTNetwork architecture.

    Since the model takes a time series as input, it requires that input data frames use a special naming scheme for
    columns such that the data can be interpreted correctly:
    Each column name must start with an N-digit prefix indicating the time slice the data pertains to (for any fixed N);
    the remaining suffix indicates the name of the actual feature.
    For each N-digit prefix, the same set of suffixes must appear in the list of columns, i.e. the same
    features must be present for each time slice in the input time series.
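
    For example, with a 2-digit prefix (N=2) and features "temp" and "load" (feature names here are purely
    illustrative), a series of two time slices would use the columns ["00temp", "00load", "01temp", "01load"].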
28 """
    def __init__(self, num_input_time_slices, input_dim_per_time_slice, num_classes: Optional[int] = None,
            num_convolutions: int = 100, num_cnn_time_slices: int = 6, hid_rnn: int = 100, skip: int = 0, hid_skip: int = 5,
            hw_window: int = 0, hw_combine: str = "plus", dropout=0.2, output_activation=ActivationFunction.LOG_SOFTMAX, cuda=True,
            nn_optimiser_params: Union[dict, NNOptimiserParams] = None):
        """
        :param num_input_time_slices: the number of input time slices
        :param input_dim_per_time_slice: the dimension of the input data per time slice
        :param num_classes: the number of classes considered by this classification problem; if None, determine from data
        :param num_convolutions: the number of separate convolutions to apply, i.e. the number of independent convolution matrices,
            a.k.a. "hidC"; if it is 0, then the entire complex processing path is not applied.
        :param num_cnn_time_slices: the number of time slices considered by each convolution (i.e. it is one of the dimensions of
            the matrix used for convolutions, the other dimension being input_dim_per_time_slice), a.k.a. "Ck"
        :param hid_rnn: the number of hidden output dimensions for the RNN stage
        :param skip: the number of time slices to skip for the skip-RNN. If it is 0, the skip-RNN is not used.
        :param hid_skip: the number of output dimensions of each of the skip parallel RNNs
        :param hw_window: the number of time slices from the end of the input time series to consider as input for the highway
            component. If it is 0, the highway component is not used.
        :param hw_combine: {"plus", "product", "bilinear"} the function with which the highway component's output is combined with
            the complex path's output
        :param dropout: the dropout probability to use during training (dropouts are applied after every major step in the
            evaluation path)
        :param output_activation: the output activation function
        :param cuda: whether to use CUDA (GPU processing)
        :param nn_optimiser_params: parameters for NNOptimiser to use for training
        """
        self.num_input_time_slices = num_input_time_slices
        self.input_dim_per_time_slice = input_dim_per_time_slice
        self.num_convolutions = num_convolutions
        self.num_cnn_time_slices = num_cnn_time_slices
        self.hid_rnn = hid_rnn
        self.skip = skip
        self.hid_skip = hid_skip
        self.hw_window = hw_window
        self.hw_combine = hw_combine
        self.dropout = dropout
        self.cuda = cuda
        self.output_activation = output_activation
        self.num_classes = num_classes
        output_mode = ClassificationOutputMode.for_activation_fn(ActivationFunction.torch_function_from_any(output_activation))
        super().__init__(output_mode, self._create_lst_network_model, nn_optimiser_params=nn_optimiser_params)

    def _create_lst_network_model(self):
        return self._LSTNetworkModel(self)

    class _LSTNetworkModel(VectorTorchModel):
        def __init__(self, parent: "LSTNetworkVectorClassificationModel"):
            super().__init__(parent.cuda)
            self.parent = parent

        def create_torch_module_for_dims(self, input_dim, output_dim):
            p = self.parent
            expected_input_dim = p.num_input_time_slices * p.input_dim_per_time_slice
            if expected_input_dim != input_dim:
                raise ValueError(f"Unexpected input size {input_dim}, expected {expected_input_dim}")
            if p.num_classes is None:
                output_dim_per_time_slice = output_dim
            else:
                output_dim_per_time_slice = p.num_classes
                if p.num_classes != output_dim:
                    raise ValueError(f"Unexpected output dim {output_dim}, expected {p.num_classes}")
            return LSTNetwork(num_input_time_slices=p.num_input_time_slices,
                input_dim_per_time_slice=p.input_dim_per_time_slice,
                num_output_time_slices=1,
                output_dim_per_time_slice=output_dim_per_time_slice,
                num_convolutions=p.num_convolutions,
                num_cnn_time_slices=p.num_cnn_time_slices,
                hid_rnn=p.hid_rnn,
                hw_window=p.hw_window,
                hw_combine=p.hw_combine,
                dropout=p.dropout,
                output_activation=p.output_activation,
                skip=p.skip,
                hid_skip=p.hid_skip,
                mode=LSTNetwork.Mode.CLASSIFICATION)

    def _create_data_set_provider(self, inputs: pd.DataFrame, outputs: pd.DataFrame) -> TorchDataSetProviderFromDataUtil:
        if self.num_classes is None:
            self.num_classes = len(self._labels)
        elif self.num_classes != len(self._labels):
            raise ValueError(f"Output dimension {self.num_classes} per time slice was specified, while the training data contains "
                f"{len(self._labels)} classes")
        return TorchDataSetProviderFromDataUtil(self.DataUtil(inputs, outputs, self.num_classes), self.cuda)

    def _predict_outputs_for_input_data_frame(self, inputs: pd.DataFrame) -> torch.Tensor:
        log.info(f"Predicting outputs for {len(inputs)} inputs")
        result = super()._predict_outputs_for_input_data_frame(inputs)
        return result.squeeze(2)  # remove the singleton output time slice dimension (num_output_time_slices=1)

    def _compute_model_inputs(self, x: pd.DataFrame, y: pd.DataFrame = None, fit=False) -> pd.DataFrame:
        x = super()._compute_model_inputs(x, y=y, fit=fit)

        # sort input data frame columns by name
        x = x[sorted(x.columns)]

        # check input column name format and consistency
        col_name_regex = re.compile(r"(\d+).+")
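        # e.g. for a column "01temperature" (illustrative name), group(1) == "01" is the time slice prefix
        # and the remainder "temperature" is the feature name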
        cols_by_time_slice = collections.defaultdict(list)
        num_digits = None
        for col_name in x.columns:
            match = col_name_regex.fullmatch(col_name)
            if not match:
                raise ValueError(f"Column name '{col_name}' does not match the required format (N-digit prefix indicating the "
                    f"time slice, followed by the feature name, for any fixed N); columns={list(x.columns)}")
            time_slice = match.group(1)
            if num_digits is None:
                num_digits = len(time_slice)
            elif num_digits != len(time_slice):
                raise ValueError(f"Inconsistent number of digits in column names: Got {num_digits} leading digits for one feature "
                    f"and {len(time_slice)} for another; columns={list(x.columns)}")
            cols_by_time_slice[time_slice].append(col_name[num_digits:])
        reference_cols = None
        for time_slice, cols in cols_by_time_slice.items():
            if reference_cols is None:
                reference_cols = cols
            elif reference_cols != cols:
                raise ValueError(f"Inconsistent features across time slices: Got suffixes {cols} for one time slice and "
                    f"{reference_cols} for another; columns={list(x.columns)}")

        return x

    class DataUtil(DataUtil):  # extends the DataUtil interface imported above (which this inner class shadows by name)
        def __init__(self, x_data: pd.DataFrame, y_data: pd.DataFrame, num_classes):
            self.y_data = y_data
            self.x_data = x_data
            self.num_classes = num_classes
            self.scaler = TensorScalerIdentity()

        def input_dim(self):
            return len(self.x_data.columns)

        def model_output_dim(self) -> int:
            return self.num_classes

        def split_into_tensors(self, fractional_size_of_first_set):
            # split at the given fraction without shuffling, preserving the temporal order of the data
            split_index = round(fractional_size_of_first_set * len(self.y_data))
            y1, x1 = self.get_input_output_pair(self.y_data[:split_index], self.x_data[:split_index])
            y2, x2 = self.get_input_output_pair(self.y_data[split_index:], self.x_data[split_index:])
            return (x1, y1), (x2, y2)

        def get_input_output_pair(self, output, input):
            y = torch.tensor(output.values).long()
            x = torch.tensor(input.values).float()
            return y, x

        def get_output_tensor_scaler(self) -> TensorScaler:
            return self.scaler

        def get_input_tensor_scaler(self) -> TensorScaler:
            return self.scaler
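

if __name__ == "__main__":
    # Minimal usage sketch (illustrative only): builds a toy data set following the column naming
    # scheme documented in the class docstring and instantiates the model. The fit/predict calls are
    # left commented out; they assume sensAI's usual DataFrame-based VectorModel interface and that
    # the surrounding package is importable. All data and feature names below are made up.
    import numpy as np

    time_slices, features = 4, ["temp", "load"]
    columns = [f"{t}{f}" for t in range(time_slices) for f in features]  # "0temp", "0load", "1temp", ...
    x_demo = pd.DataFrame(np.random.randn(50, len(columns)), columns=columns)
    y_demo = pd.DataFrame({"class": np.random.randint(0, 3, 50)})

    model = LSTNetworkVectorClassificationModel(num_input_time_slices=time_slices,
        input_dim_per_time_slice=len(features), num_classes=3, cuda=False)
    # model.fit(x_demo, y_demo)
    # predictions = model.predict(x_demo)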