Coverage for src/sensai/torch/torch_data.py: 67%

386 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import logging 

2from abc import ABC, abstractmethod 

3import math 

4from typing import Tuple, Sequence, Optional, Union, List, Iterator 

5 

6import numpy as np 

7import pandas as pd 

8import sklearn.preprocessing 

9import torch 

10from torch.autograd import Variable 

11 

12from .. import normalisation 

13from ..data import DataFrameSplitter, DataFrameSplitterFractional 

14from ..data_transformation import DFTSkLearnTransformer 

15from ..util.dtype import to_float_array 

16from ..util.pickle import setstate 

17 

18 

19log = logging.getLogger(__name__) 

20 

21 

def to_tensor(d: Union[torch.Tensor, np.ndarray, list], cuda=False):
    """
    Converts the given data to a torch tensor (unless it already is one) and optionally moves it to the CUDA device.

    :param d: a tensor (used as is), a numpy array or a list (which is converted via a numpy array)
    :param cuda: whether to move the resulting tensor to the CUDA device
    :return: the tensor
    :raises ValueError: if ``d`` is of an unsupported type
    """
    if not isinstance(d, torch.Tensor):
        if isinstance(d, np.ndarray):
            d = torch.from_numpy(d)
        elif isinstance(d, list):
            d = torch.from_numpy(np.array(d))
        else:
            raise ValueError(f"Cannot convert object of type {type(d)} to a tensor")
    if cuda:
        # Tensor.cuda() does not operate in place but returns the tensor on the device, so the result must be kept
        d = d.cuda()
    return d

33 

34 

class TensorScaler(ABC):
    """
    Interface for objects which scale/normalise tensors and are able to invert that transformation
    """

    @abstractmethod
    def cuda(self):
        """
        Makes this scaler's components use CUDA
        """

    @abstractmethod
    def normalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        Applies scaling/normalisation to the given tensor

        :param tensor: the tensor to scale/normalise
        :return: the scaled/normalised tensor
        """

    @abstractmethod
    def denormalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        Applies the inverse of method normalise to the given tensor

        :param tensor: the tensor to denormalise
        :return: the denormalised tensor
        """

60 

61 

class TensorScalerCentreAndScale(TensorScaler):
    """
    Scaler which normalises a tensor by first subtracting a centre and then multiplying by a scale factor.
    Either component may be None, in which case the respective step is skipped.
    """
    def __init__(self, centre: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None):
        """
        :param centre: the value(s) to subtract during normalisation; None to skip centring
        :param scale: the factor(s) to multiply by during normalisation; None to skip scaling
        """
        self.centre = centre
        self.scale = scale

    def cuda(self):
        """
        Moves the centre and scale tensors (where present) to the CUDA device
        """
        if self.scale is not None:
            self.scale = self.scale.cuda()
        if self.centre is not None:
            self.centre = self.centre.cuda()

    def normalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        Computes ``(tensor - centre) * scale``, skipping components that are None.

        :param tensor: the tensor to normalise; it is not modified
        :return: the normalised tensor (the given tensor itself if there is nothing to apply)
        """
        # Out-of-place arithmetic is used deliberately: in-place operators would silently alter the caller's data
        # and raise errors for leaf tensors that require grad or for tensors whose dtype cannot hold the result.
        if self.centre is not None:
            tensor = tensor - self.centre
        if self.scale is not None:
            tensor = tensor * self.scale
        return tensor

    def denormalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        Computes ``tensor / scale + centre``, skipping components that are None (the inverse of :meth:`normalise`).

        :param tensor: the tensor to denormalise; it is not modified
        :return: the denormalised tensor (the given tensor itself if there is nothing to apply)
        """
        if self.scale is not None:
            tensor = tensor / self.scale
        if self.centre is not None:
            tensor = tensor + self.centre
        return tensor

86 

87 

class TensorScalerFromVectorDataScaler(TensorScalerCentreAndScale):
    """
    Tensor scaler whose centre and scale are taken from a ``normalisation.VectorDataScaler``
    """
    def __init__(self, vector_data_scaler: normalisation.VectorDataScaler, cuda: bool):
        """
        :param vector_data_scaler: the scaler from which to take the ``scale`` and ``translate`` arrays
        :param cuda: whether to move the resulting tensors to the CUDA device
        """
        divisor = vector_data_scaler.scale
        if divisor is None:
            factor = None
        else:
            # the value is treated as an inverse scale here, whereas the base class multiplies by its scale
            factor = 1.0 / torch.from_numpy(divisor).float()
        offset = vector_data_scaler.translate
        if offset is not None:
            offset = torch.from_numpy(offset).float()
        super().__init__(centre=offset, scale=factor)
        if cuda:
            self.cuda()

    def __setstate__(self, state):
        # persisted states containing "translate" use the old representation where scale is actually inverse scale
        is_old_representation = "translate" in state
        if is_old_representation and state["scale"] is not None:
            state["scale"] = 1.0 / state["scale"]
        setstate(TensorScalerFromVectorDataScaler, self, state, renamed_properties={"translate": "centre"})

107 

108 

class TensorScalerIdentity(TensorScaler):
    """
    Scaler which leaves tensors unchanged
    """
    def cuda(self):
        """
        Does nothing, as this scaler has no components
        """
        return None

    def normalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        :param tensor: any tensor
        :return: the very same tensor
        """
        return tensor

    def denormalise(self, tensor: torch.Tensor) -> torch.Tensor:
        """
        :param tensor: any tensor
        :return: the very same tensor
        """
        return tensor

118 

119 

class TensorScalerFromDFTSkLearnTransformer(TensorScalerCentreAndScale):
    """
    Tensor scaler derived from the fitted sklearn transformer held by a ``DFTSkLearnTransformer``.
    Only ``sklearn.preprocessing.RobustScaler`` is handled.
    """
    def __init__(self, dft: DFTSkLearnTransformer):
        """
        :param dft: the data frame transformer whose ``sklearnTransformer`` provides centre and scale
        :raises ValueError: if the sklearn transformer is of an unhandled type
        """
        transformer = dft.sklearnTransformer
        if not isinstance(transformer, sklearn.preprocessing.RobustScaler):
            raise ValueError(f"sklearn transformer of type '{transformer.__class__}' is unhandled")
        centre = transformer.center_
        scale = transformer.scale_
        # the transformer's scale is converted to its reciprocal, because the base class multiplies by its scale
        is_reciprocal_scale = True

        centre_tensor = None if centre is None else torch.from_numpy(centre).float()
        scale_tensor = None
        if scale is not None:
            scale_tensor = torch.from_numpy(scale).float()
            if is_reciprocal_scale:
                scale_tensor = 1.0 / scale_tensor
        super().__init__(centre=centre_tensor, scale=scale_tensor)

136 

137 

class Tensoriser(ABC):
    """
    Represents a method for transforming a data frame into one or more tensors to be processed by a neural network model
    """
    def tensorise(self, df: pd.DataFrame) -> Union[torch.Tensor, List[torch.Tensor]]:
        """
        Applies :meth:`_tensorise` and verifies that the result's length matches the data frame's length.

        :param df: the data frame to convert
        :return: the tensor or collection of tensors produced by :meth:`_tensorise`
        :raises Exception: if the tensors differ in length or the length does not match ``len(df)``
        """
        result = self._tensorise(df)
        # A collection of tensors (list or tuple) must be checked per tensor; the collection's own length is the
        # number of tensors and says nothing about the number of data points.
        if isinstance(result, (list, tuple)):
            lengths = {len(t) for t in result}
            if len(lengths) != 1:
                raise Exception("Lengths of tensors inconsistent")
            length = lengths.pop()
        else:
            length = len(result)
        if length != len(df):
            raise Exception(f"{self} produced result of length {length} for DataFrame of shape {df.shape}")
        return result

    @abstractmethod
    def _tensorise(self, df: pd.DataFrame) -> Union[torch.Tensor, List[torch.Tensor]]:
        """
        :param df: the data frame to convert
        :return: a tensor or a collection of tensors, each with one entry per row of ``df``
        """
        pass

    @abstractmethod
    def fit(self, df: pd.DataFrame, model=None):
        """
        :param df: the data frame with which to fit this tensoriser
        :param model: the model in the context of which the fitting takes place (if any).
            The fitting process may set parameters within the model that can only be determined from the (pre-tensorised) data.
        """
        pass

167 

168 

class RuleBasedTensoriser(Tensoriser, ABC):
    """
    Base class for tensorisers whose transformation of data frames into tensors follows a predefined set of rules,
    such that no fitting is required
    """
    def fit(self, df: pd.DataFrame, model=None):
        """
        Does nothing, because rule-based tensorisers need not be fitted.

        :param df: ignored
        :param model: ignored
        """
        return None

175 

176 

class TensoriserDataFrameFloatValuesMatrix(RuleBasedTensoriser):
    """
    Tensoriser which converts the entire data frame to a float array (via ``to_float_array``) and returns it as a float tensor
    """
    def _tensorise(self, df: pd.DataFrame) -> torch.Tensor:
        """
        :param df: the data frame to convert
        :return: a float tensor created from the data frame's float array representation
        """
        return torch.from_numpy(to_float_array(df)).float()

180 

181 

class TensoriserClassLabelIndices(RuleBasedTensoriser):
    """
    Tensoriser for data frames with a single column of class label indices, which are returned as a long tensor
    """
    def _tensorise(self, df: pd.DataFrame) -> torch.Tensor:
        """
        :param df: a data frame with exactly one column
        :return: a long tensor containing the values of that column
        :raises ValueError: if the data frame does not have exactly one column
        """
        if len(df.columns) != 1:
            raise ValueError("Expected a single column containing the class label indices")
        return torch.from_numpy(df[df.columns[0]].values).long()

187 

188 

class DataUtil(ABC):
    """Interface for DataUtil classes, which are used to process data for neural networks"""

    @abstractmethod
    def split_into_tensors(self, fractional_size_of_first_set) \
            -> Tuple[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        """
        Splits the data set into two parts

        :param fractional_size_of_first_set: the desired fractional size of the first set
        :return: a tuple (A, B) where A and B are tuples (in, out) with input and output data
        """

    @abstractmethod
    def get_output_tensor_scaler(self) -> TensorScaler:
        """
        Gets the scaler with which to scale model outputs

        :return: the scaler
        """

    @abstractmethod
    def get_input_tensor_scaler(self) -> TensorScaler:
        """
        Gets the scaler with which to scale model inputs

        :return: the scaler
        """

    @abstractmethod
    def model_output_dim(self) -> int:
        """
        :return: the dimensionality that is to be output by the model to be trained
        """

    @abstractmethod
    def input_dim(self):
        """
        :return: the dimensionality of the input or None if it is variable
        """

234 

235 

class VectorDataUtil(DataUtil):
    """
    Data utility for inputs and outputs given as data frames, supporting normalisation, splitting and tensorisation
    """
    def __init__(self,
            inputs: pd.DataFrame,
            outputs: pd.DataFrame,
            cuda: bool,
            normalisation_mode=normalisation.NormalisationMode.NONE,
            differing_output_normalisation_mode=None,
            input_tensoriser: Optional[Tensoriser] = None,
            output_tensoriser: Optional[Tensoriser] = None,
            data_frame_splitter: Optional[DataFrameSplitter] = None):
        """
        :param inputs: the data frame of inputs
        :param outputs: the data frame of outputs
        :param cuda: whether the tensor scalers created for inputs and outputs shall use CUDA
        :param normalisation_mode: the normalisation mode to use for inputs and (unless differing_output_normalisation_mode
            is specified) outputs
        :param differing_output_normalisation_mode: the normalisation mode to apply to outputs, overriding normalisation_mode;
            if None, use normalisation_mode
        :param input_tensoriser: the tensoriser for inputs; if None, use TensoriserDataFrameFloatValuesMatrix
        :param output_tensoriser: the tensoriser for outputs; if None, use TensoriserDataFrameFloatValuesMatrix
        :param data_frame_splitter: the splitter with which to compute splits; if None, a fractional split without
            shuffling is used
        """
        if inputs.shape[0] != outputs.shape[0]:
            raise ValueError("Output length must be equal to input length")
        self.inputs = inputs
        self.outputs = outputs

        if input_tensoriser is None:
            input_tensoriser = TensoriserDataFrameFloatValuesMatrix()
        self.inputTensoriser = input_tensoriser
        if output_tensoriser is None:
            output_tensoriser = TensoriserDataFrameFloatValuesMatrix()
        self.outputTensoriser = output_tensoriser

        self.inputVectorDataScaler = normalisation.VectorDataScaler(self.inputs, normalisation_mode)
        self.inputTensorScaler = TensorScalerFromVectorDataScaler(self.inputVectorDataScaler, cuda)

        output_mode = differing_output_normalisation_mode
        if output_mode is None:
            output_mode = normalisation_mode
        self.outputVectorDataScaler = normalisation.VectorDataScaler(self.outputs, output_mode)
        self.outputTensorScaler = TensorScalerFromVectorDataScaler(self.outputVectorDataScaler, cuda)

        self.dataFrameSplitter = data_frame_splitter

    def __len__(self):
        return len(self.inputs)

    def get_output_tensor_scaler(self):
        return self.outputTensorScaler

    def get_input_tensor_scaler(self):
        return self.inputTensorScaler

    def _compute_split_indices(self, fractional_size_of_first_set):
        """
        :param fractional_size_of_first_set: the fractional size of the first set
        :return: a pair of index collections for the first and the second set
        """
        splitter = self.dataFrameSplitter
        if splitter is None:
            # By default, we use a simple fractional split without shuffling.
            # Shuffling is usually unnecessary, because in evaluation contexts, the data may have already been shuffled
            # by the evaluator (unless explicitly disabled by the user). Furthermore, not shuffling gives the user the
            # possibility to manually order the data in ways that result in desirable fractional splits (though the
            # user may, of course, simply override the splitter to achieve any desired split).
            splitter = DataFrameSplitterFractional(shuffle=False)
        first, second = splitter.compute_split_indices(self.inputs, fractional_size_of_first_set)
        return first, second

    def split_into_tensors(self, fractional_size_of_first_set):
        first_indices, second_indices = self._compute_split_indices(fractional_size_of_first_set)
        first = self._tensors_for_indices(first_indices)
        second = self._tensors_for_indices(second_indices)
        return first, second

    def _data_frames_for_indices(self, indices):
        """
        :param indices: positional row indices
        :return: the pair (input data frame, output data frame) restricted to the given rows
        """
        return self.inputs.iloc[indices], self.outputs.iloc[indices]

    def _tensors_for_indices(self, indices):
        input_df, output_df = self._data_frames_for_indices(indices)
        return self._tensors_for_data_frames(input_df, output_df)

    @staticmethod
    def _normalised_data_frame(scaler, df):
        """
        :param scaler: the vector data scaler to apply
        :param df: the data frame to normalise
        :return: df itself if the scaler's mode is NONE, else a normalised data frame with the same columns and index
        """
        if scaler.normalisation_mode == normalisation.NormalisationMode.NONE:
            return df
        return pd.DataFrame(scaler.get_normalised_array(df), columns=df.columns, index=df.index)

    def _tensors_for_data_frames(self, input_df, output_df):
        # apply normalisation (if any) before handing the data to the tensorisers
        input_df = self._normalised_data_frame(self.inputVectorDataScaler, input_df)
        output_df = self._normalised_data_frame(self.outputVectorDataScaler, output_df)
        input_tensors = self.inputTensoriser.tensorise(input_df)
        output_tensors = self.outputTensoriser.tensorise(output_df)
        return input_tensors, output_tensors

    def split_into_data_sets(self, fractional_size_of_first_set, cuda: bool, tensorise_dynamically=False) \
            -> Tuple["TorchDataSet", "TorchDataSet"]:
        """
        :param fractional_size_of_first_set: the fractional size of the first data set
        :param cuda: whether the data sets shall move generated tensors to the CUDA device
        :param tensorise_dynamically: whether to tensorise per batch rather than in advance; not supported in
            conjunction with data scaling
        :return: the pair of data sets
        """
        if not tensorise_dynamically:
            (x_first, y_first), (x_second, y_second) = self.split_into_tensors(fractional_size_of_first_set)
            return TorchDataSetFromTensors(x_first, y_first, cuda), TorchDataSetFromTensors(x_second, y_second, cuda)

        no_scaling = normalisation.NormalisationMode.NONE
        scalers = (self.inputVectorDataScaler, self.outputVectorDataScaler)
        if any(scaler.normalisation_mode != no_scaling for scaler in scalers):
            raise Exception("Dynamic tensorisation is not supported when using data scaling")
        first_indices, second_indices = self._compute_split_indices(fractional_size_of_first_set)
        data_sets = []
        for indices in (first_indices, second_indices):
            input_df, output_df = self._data_frames_for_indices(indices)
            data_sets.append(TorchDataSetFromDataFramesDynamicallyTensorised(input_df, output_df, cuda,
                input_tensoriser=self.inputTensoriser, output_tensoriser=self.outputTensoriser))
        return data_sets[0], data_sets[1]

    def input_dim(self):
        return self.inputs.shape[1]

    def output_dim(self):
        """
        :return: the dimensionality of the outputs (ground truth values)
        """
        return self.outputs.shape[1]

    def model_output_dim(self):
        return self.output_dim()

344 

345 

class ClassificationVectorDataUtil(VectorDataUtil):
    """
    Vector data utility for classification, where the outputs consist of a single column holding the class index
    and are never normalised
    """
    def __init__(self,
            inputs: pd.DataFrame,
            outputs: pd.DataFrame,
            cuda,
            num_classes,
            normalisation_mode=normalisation.NormalisationMode.NONE,
            input_tensoriser: Tensoriser = None,
            output_tensoriser: Tensoriser = None,
            data_frame_splitter: Optional[DataFrameSplitter] = None):
        """
        :param inputs: the data frame of inputs
        :param outputs: the data frame of outputs, which must have exactly one column (the class index)
        :param cuda: passed on to the super-class constructor
        :param num_classes: the number of classes, which is reported as the model output dimension
        :param normalisation_mode: the normalisation mode to use for inputs
        :param input_tensoriser: the tensoriser for inputs (None for the super-class default)
        :param output_tensoriser: the tensoriser for outputs; if None, use TensoriserClassLabelIndices
        :param data_frame_splitter: the splitter with which to compute splits (None for the super-class default)
        """
        num_output_columns = len(outputs.columns)
        if num_output_columns != 1:
            raise Exception(f"Exactly one output dimension (the class index) is required, got {len(outputs.columns)}")
        if output_tensoriser is None:
            output_tensoriser = TensoriserClassLabelIndices()
        super().__init__(inputs, outputs, cuda,
            normalisation_mode=normalisation_mode,
            differing_output_normalisation_mode=normalisation.NormalisationMode.NONE,
            input_tensoriser=input_tensoriser,
            output_tensoriser=output_tensoriser,
            data_frame_splitter=data_frame_splitter)
        self.numClasses = num_classes

    def model_output_dim(self):
        return self.numClasses

366 

367 

class TorchDataSet:
    """
    Base class for data sets which can provide their data in batches.

    NOTE: The methods are marked as abstract, but as this class does not derive from ABC, this is not enforced
    upon instantiation.
    """

    @abstractmethod
    def iter_batches(self, batch_size: int, shuffle: bool = False, input_only=False) -> Iterator[Union[
            Tuple[torch.Tensor, torch.Tensor],
            Tuple[Sequence[torch.Tensor], torch.Tensor],
            torch.Tensor,
            Sequence[torch.Tensor]]]:
        """
        Provides an iterator over batches from the data set.

        :param batch_size: the maximum size of each batch
        :param shuffle: whether to shuffle the data set
        :param input_only: whether to provide only inputs (rather than inputs and corresponding outputs).
            If true, provide only inputs, where inputs can either be a tensor or a tuple of tensors.
            If false, provide a pair (i, o) with inputs and corresponding outputs (o is always a tensor).
            Some data sets may only be able to provide inputs, in which case input_only=False should lead to an
            exception.
        """

    @abstractmethod
    def size(self) -> Optional[int]:
        """
        Returns the total size of the data set (number of data points) if it is known.

        :return: the number of data points or None if the size is not known.
        """

393 

394 

class TorchDataSetProvider:
    """
    Base class for objects which provide splits of a data set along with the associated scalers and dimensions
    """
    def __init__(self, input_tensor_scaler: Optional[TensorScaler] = None, output_tensor_scaler: Optional[TensorScaler] = None,
            input_dim: Optional[int] = None, model_output_dim: int = None):
        """
        :param input_tensor_scaler: the scaler for model inputs; if None, use an identity scaler
        :param output_tensor_scaler: the scaler for model outputs; if None, use an identity scaler
        :param input_dim: the input dimension (may be None)
        :param model_output_dim: the model output dimension, which must be provided
        :raises ValueError: if model_output_dim is None
        """
        if model_output_dim is None:
            raise ValueError("The model output dimension must be provided")
        self.inputTensorScaler = TensorScalerIdentity() if input_tensor_scaler is None else input_tensor_scaler
        self.outputTensorScaler = TensorScalerIdentity() if output_tensor_scaler is None else output_tensor_scaler
        self.inputDim = input_dim
        self.modelOutputDim = model_output_dim

    @abstractmethod
    def provide_split(self, fractional_size_of_first_set: float) -> Tuple[TorchDataSet, TorchDataSet]:
        """
        Provides two data sets, which could, for example, serve as training and validation sets.

        :param fractional_size_of_first_set: the fractional size of the first data set
        :return: a tuple of data sets (A, B) where A has (approximately) the given fractional size and B encompasses
            the remainder of the data
        """

    def get_output_tensor_scaler(self) -> TensorScaler:
        return self.outputTensorScaler

    def get_input_tensor_scaler(self) -> TensorScaler:
        return self.inputTensorScaler

    def get_model_output_dim(self) -> int:
        """
        :return: the number of output dimensions that would be required to be generated by the model to match this dataset.
        """
        return self.modelOutputDim

    def get_input_dim(self) -> Optional[int]:
        """
        :return: the number of input dimensions that the model is to process for this dataset.
            For models that accept variable input sizes (such as RNNs), this may be None.
        """
        return self.inputDim

438 

439 

class TensorTuple:
    """
    Holds one or more tensors of equal length and allows them to be manipulated simultaneously
    """
    def __init__(self, tensors: Union[torch.Tensor, Sequence[torch.Tensor]]):
        """
        :param tensors: a single tensor or a sequence of tensors which all have the same length
        :raises ValueError: if the lengths of the tensors are not all the same
        """
        if isinstance(tensors, torch.Tensor):
            tensors = [tensors]
        distinct_lengths = {len(tensor) for tensor in tensors}
        if len(distinct_lengths) != 1:
            raise ValueError("Not all tensors are of the same length")
        (self.length,) = distinct_lengths
        self.tensors = tensors

    def __len__(self):
        return self.length

    def __getitem__(self, key) -> "TensorTuple":
        """
        :param key: the key with which to index every contained tensor
        :return: a new tensor tuple containing the indexed tensors
        """
        return TensorTuple(tuple(tensor[key] for tensor in self.tensors))

    def cuda(self) -> "TensorTuple":
        """
        :return: a new tensor tuple containing the result of ``cuda()`` for every contained tensor
        """
        on_device = [tensor.cuda() for tensor in self.tensors]
        return TensorTuple(on_device)

    def tuple(self) -> Sequence[torch.Tensor]:
        """
        :return: the contained tensors as a tuple
        """
        return tuple(self.tensors)

    def item(self) -> Union[torch.Tensor, Sequence[torch.Tensor]]:
        """
        :return: the tensor itself if exactly one tensor is contained, else the tuple of tensors
        """
        return self.tensors[0] if len(self.tensors) == 1 else self.tuple()

    def concat(self, other: "TensorTuple") -> "TensorTuple":
        """
        :param other: a tensor tuple containing the same number of tensors
        :return: a new tensor tuple in which corresponding tensors are concatenated along dimension 0
        :raises ValueError: if the numbers of contained tensors differ
        """
        if len(self.tensors) != len(other.tensors):
            raise ValueError("Tensor tuples are incompatible")
        joined = []
        for mine, theirs in zip(self.tensors, other.tensors):
            joined.append(torch.cat([mine, theirs], dim=0))
        return TensorTuple(joined)

477 

478 

class TorchDataSetFromTensors(TorchDataSet):
    """
    Data set which is backed by tensors that are fully available up front
    """
    def __init__(self, x: Union[torch.Tensor, Sequence[torch.Tensor]], y: Optional[torch.Tensor], cuda: bool):
        """
        :param x: the input tensor(s); if more than one, they must be of the same length (and a slice of each shall be provided to the
            model as an input in each batch)
        :param y: the output tensor
        :param cuda: whether any generated tensors shall be moved to the selected CUDA device
        :raises ValueError: if inputs and outputs differ in length
        """
        x = TensorTuple(x)
        y = TensorTuple(y) if y is not None else None
        if y is not None and len(x) != len(y):
            raise ValueError("Tensors are not of the same length")
        self.x = x
        self.y = y
        self.cuda = cuda

    def iter_batches(self, batch_size: int, shuffle: bool = False, input_only=False) -> Iterator[Union[
            Tuple[torch.Tensor, torch.Tensor],
            Tuple[Sequence[torch.Tensor], torch.Tensor],
            torch.Tensor,
            Sequence[torch.Tensor]]]:
        """
        :param batch_size: the maximum size of each batch; must be positive
        :param shuffle: whether to iterate in random order
        :param input_only: whether to provide inputs only; inputs only are also provided if no outputs are present
        :return: an iterator over batches
        """
        tensor_tuples = (self.x, self.y) if not input_only and self.y is not None else (self.x,)
        yield from self._get_batches(tensor_tuples, batch_size, shuffle)

    def _get_batches(self, tensor_tuples: Sequence[TensorTuple], batch_size, shuffle):
        """
        Generates batches from the given tensor tuples, all of which must be of the same length.

        :param tensor_tuples: the tensor tuples from which to draw corresponding slices
        :param batch_size: the maximum batch size; must be positive
        :param shuffle: whether to iterate in random order
        :raises ValueError: if batch_size is not positive
        :raises Exception: if the tensor tuples differ in length
        """
        if batch_size <= 0:
            # without this check, the loop below would never advance and yield empty batches forever
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        length = len(tensor_tuples[0])
        if any(len(tensor_tuple) != length for tensor_tuple in tensor_tuples):
            raise Exception("Passed tensors of differing lengths")
        index = torch.randperm(length) if shuffle else torch.arange(length)
        start_idx = 0
        while start_idx < length:
            remaining_items = length - start_idx
            is_second_last_batch = batch_size < remaining_items <= 2 * batch_size
            if is_second_last_batch:
                # to avoid cases where the last batch is excessively small (1 item in the worst case, where e.g. batch
                # normalisation would not be applicable), we evenly distribute the items across the last two batches
                end_idx = min(length, start_idx + math.ceil(remaining_items / 2))
            else:
                end_idx = min(length, start_idx + batch_size)
            excerpt = index[start_idx:end_idx]
            batch = []
            for tensor_tuple in tensor_tuples:
                selection = tensor_tuple[excerpt]
                if self.cuda:
                    selection = selection.cuda()
                item = selection.item()
                if isinstance(item, tuple):
                    item = tuple(Variable(t) for t in item)
                else:
                    item = Variable(item)
                batch.append(item)
            # a sole element (inputs only) is provided directly rather than as a 1-tuple
            yield batch[0] if len(batch) == 1 else tuple(batch)
            start_idx = end_idx

    def size(self):
        """
        :return: the number of data points
        """
        return len(self.x)

539 

540 

class TorchDataSetFromDataFramesPreTensorised(TorchDataSetFromTensors):
    """
    Data set for data frames which are converted to tensors in their entirety upon construction
    """
    def __init__(self, input_df: pd.DataFrame, output_df: Optional[pd.DataFrame], cuda: bool,
            input_tensoriser: Optional[Tensoriser] = None, output_tensoriser: Optional[Tensoriser] = None):
        """
        :param input_df: the data frame of inputs
        :param output_df: the data frame of outputs (may be None)
        :param cuda: whether any generated tensors shall be moved to the selected CUDA device
        :param input_tensoriser: the tensoriser for inputs; if None, use TensoriserDataFrameFloatValuesMatrix
        :param output_tensoriser: the tensoriser for outputs; if None, use TensoriserDataFrameFloatValuesMatrix
        """
        if input_tensoriser is None:
            input_tensoriser = TensoriserDataFrameFloatValuesMatrix()
        log.debug(f"Applying {input_tensoriser} to data frame of length {len(input_df)} ...")
        x = input_tensoriser.tensorise(input_df)

        y = None
        if output_df is not None:
            if output_tensoriser is None:
                output_tensoriser = TensoriserDataFrameFloatValuesMatrix()
            log.debug(f"Applying {output_tensoriser} to data frame of length {len(output_df)} ...")
            y = output_tensoriser.tensorise(output_df)

        super().__init__(x, y, cuda)

556 

557 

class TorchDataSetFromDataFramesDynamicallyTensorised(TorchDataSet):
    """
    Data set for data frames which are converted to tensors batch by batch, i.e. only when iterating
    """
    def __init__(self, input_df: pd.DataFrame, output_df: Optional[pd.DataFrame], cuda: bool,
            input_tensoriser: Optional[Tensoriser] = None, output_tensoriser: Optional[Tensoriser] = None):
        """
        :param input_df: the data frame of inputs
        :param output_df: the data frame of outputs (may be None, in which case only inputs can be provided)
        :param cuda: whether any generated tensors shall be moved to the selected CUDA device
        :param input_tensoriser: the tensoriser for inputs; if None, use TensoriserDataFrameFloatValuesMatrix
        :param output_tensoriser: the tensoriser for outputs; if None (and outputs are given), use
            TensoriserDataFrameFloatValuesMatrix
        :raises ValueError: if the data frames differ in length
        """
        self.inputDF = input_df
        self.outputDF = output_df
        self.cuda = cuda
        if input_tensoriser is None:
            input_tensoriser = TensoriserDataFrameFloatValuesMatrix()
        self.inputTensoriser = input_tensoriser
        if output_df is not None:
            if len(input_df) != len(output_df):
                raise ValueError("Lengths of input and output data frames must be equal")
            if output_tensoriser is None:
                output_tensoriser = TensoriserDataFrameFloatValuesMatrix()
        # always assigned, such that the attribute exists even if no outputs are given
        self.outputTensoriser = output_tensoriser

    def size(self) -> Optional[int]:
        """
        :return: the number of data points
        """
        return len(self.inputDF)

    def iter_batches(self, batch_size: int, shuffle: bool = False, input_only=False):
        """
        :param batch_size: the maximum size of each batch; must be positive
        :param shuffle: whether to iterate in random order
        :param input_only: whether to provide inputs only; must be True if no outputs are available
        :return: an iterator over batches, each either the inputs alone or a pair (inputs, outputs)
        :raises ValueError: if batch_size is not positive or if outputs are requested but not available
        """
        if batch_size <= 0:
            # without this check, the loop below would never advance and yield empty batches forever
            raise ValueError(f"batch_size must be positive, got {batch_size}")
        if not input_only and self.outputDF is None:
            raise ValueError("No output data is available; batches can only be provided with input_only=True")
        length = len(self.inputDF)
        # the positional indexer is handed to pandas as a NumPy integer array rather than as a torch tensor
        index = torch.randperm(length).numpy() if shuffle else np.arange(length)
        for start in range(0, length, batch_size):
            batch_indices = index[start:start + batch_size]
            input_tensors = TensorTuple(self.inputTensoriser.tensorise(self.inputDF.iloc[batch_indices]))
            if self.cuda:
                input_tensors = input_tensors.cuda()
            if input_only:
                yield input_tensors.item()
            else:
                output_tensors = TensorTuple(self.outputTensoriser.tensorise(self.outputDF.iloc[batch_indices]))
                if self.cuda:
                    output_tensors = output_tensors.cuda()
                yield input_tensors.item(), output_tensors.item()

597 

598 

class TorchDataSetFromDataFrames(TorchDataSet):
    """
    Data set for data frames which delegates to either a dynamically tensorised or a pre-tensorised data set
    """
    def __init__(self, input_df: pd.DataFrame, output_df: Optional[pd.DataFrame], cuda: bool,
            input_tensoriser: Optional[Tensoriser] = None, output_tensoriser: Optional[Tensoriser] = None,
            tensorise_dynamically=False):
        """
        :param input_df: the data frame of inputs
        :param output_df: the data frame of outputs (may be None)
        :param cuda: whether any generated tensors shall be moved to the selected CUDA device
        :param input_tensoriser: the tensoriser for inputs (None for the delegate's default)
        :param output_tensoriser: the tensoriser for outputs (None for the delegate's default)
        :param tensorise_dynamically: whether to tensorise per batch rather than once upon construction
        """
        if tensorise_dynamically:
            delegate_class = TorchDataSetFromDataFramesDynamicallyTensorised
        else:
            delegate_class = TorchDataSetFromDataFramesPreTensorised
        self._torchDataSet: TorchDataSet = delegate_class(input_df, output_df, cuda,
            input_tensoriser=input_tensoriser, output_tensoriser=output_tensoriser)

    def iter_batches(self, batch_size: int, shuffle: bool = False, input_only=False):
        delegate = self._torchDataSet
        yield from delegate.iter_batches(batch_size, shuffle=shuffle, input_only=input_only)

    def size(self) -> Optional[int]:
        delegate = self._torchDataSet
        return delegate.size()

615 

616 

class TorchDataSetProviderFromDataUtil(TorchDataSetProvider):
    """
    Data set provider which obtains scalers, dimensions and tensor splits from a DataUtil
    """
    def __init__(self, data_util: DataUtil, cuda: bool):
        """
        :param data_util: the data utility from which to obtain all data
        :param cuda: whether the provided data sets shall move generated tensors to the CUDA device
        """
        super().__init__(
            input_tensor_scaler=data_util.get_input_tensor_scaler(),
            output_tensor_scaler=data_util.get_output_tensor_scaler(),
            input_dim=data_util.input_dim(),
            model_output_dim=data_util.model_output_dim())
        self.dataUtil = data_util
        self.cuda = cuda

    def provide_split(self, fractional_size_of_first_set: float) -> Tuple[TorchDataSet, TorchDataSet]:
        first, second = self.dataUtil.split_into_tensors(fractional_size_of_first_set)
        x1, y1 = first
        x2, y2 = second
        first_data_set = TorchDataSetFromTensors(x1, y1, self.cuda)
        second_data_set = TorchDataSetFromTensors(x2, y2, self.cuda)
        return first_data_set, second_data_set

627 

628 

class TorchDataSetProviderFromVectorDataUtil(TorchDataSetProvider):
    """
    Data set provider which obtains scalers, dimensions and data set splits from a VectorDataUtil
    """
    def __init__(self, data_util: VectorDataUtil, cuda: bool, tensorise_dynamically=False):
        """
        :param data_util: the data utility from which to obtain all data
        :param cuda: whether the provided data sets shall move generated tensors to the CUDA device
        :param tensorise_dynamically: whether the provided data sets shall tensorise per batch
        """
        super().__init__(
            input_tensor_scaler=data_util.get_input_tensor_scaler(),
            output_tensor_scaler=data_util.get_output_tensor_scaler(),
            input_dim=data_util.input_dim(),
            model_output_dim=data_util.model_output_dim())
        self.dataUtil = data_util
        self.cuda = cuda
        self.tensoriseDynamically = tensorise_dynamically

    def provide_split(self, fractional_size_of_first_set: float) -> Tuple[TorchDataSet, TorchDataSet]:
        data_sets = self.dataUtil.split_into_data_sets(fractional_size_of_first_set, self.cuda,
            tensorise_dynamically=self.tensoriseDynamically)
        return data_sets

639 

640 

class TensorTransformer(ABC):
    """
    Interface for objects which map a tensor to a tensor
    """
    @abstractmethod
    def transform(self, t: torch.Tensor) -> torch.Tensor:
        """
        :param t: the tensor to transform
        :return: the transformed tensor
        """

644 pass