import functools
import logging
import re
from abc import ABC, abstractmethod
from typing import Sequence, List, Union, Callable, Any, Dict, TYPE_CHECKING, Optional

import numpy as np
import pandas as pd

from .. import util, data_transformation
from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
from ..util import flatten_arguments
from ..util.string import or_regex_group, ToStringMixin, list_string
from ..util.typing import PandasNamedTuple

if TYPE_CHECKING:
    from ..vector_model import VectorModel
    from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)


class DuplicateColumnNamesException(Exception):
    pass


class FeatureGenerator(ToStringMixin, ABC):
    """
    Base class for feature generators that create a new DataFrame containing feature values
    from an input DataFrame
    """
    def __init__(self,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules: bool = True):
        """
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations, e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: this parameter can be supplied instead of `normalisation_rules` for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labelled as
            categorical. Like `normalisation_rules`, this is only relevant if a DFTNormalisation object consuming
            normalisation rules is instantiated and used within a data processing pipeline.
            It does not affect feature generation.
        :param add_categorical_default_rules:
            if True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding
            one-hot encoded features (with "_<index>" appended) will be added. This does not affect feature generation.
        """
        # NOTE: While it would be more elegant not to have all of the above constructor arguments and instead provide
        # them later using "with*" methods, this would have the significant drawback that it would enable
        # all such attributes to be provided in all subclasses, even in ones where we know the settings exactly
        # and can provide them directly in the subclass constructor implementation. It would thus enable
        # nonsensical settings, which should be avoided.
        if len(normalisation_rules) > 0 and normalisation_rule_template is not None:
            raise ValueError("Normalisation rules must be empty when a rule template is provided")

        self._generatedColumnNames = None
        self.__categoricalFeatureNames = categorical_feature_names

        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: str = categorical_feature_name_regex
        self._categoricalFeatureRules = []

        if normalisation_rule_template is not None:
            # NOTE: the placeholder rule's regex will be set in generate
            self._normalisationRules = [normalisation_rule_template.to_placeholder_rule()]
            self._mustUpdateNormalisationRuleBasedOnColumnNames = True
        else:
            self._normalisationRules = list(normalisation_rules)
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        if add_categorical_default_rules:
            if categorical_feature_name_regex is not None:
                self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex,
                    unsupported=True))
                self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex + r"_\d+",
                    skip=True))  # rule for one-hot transformation

        self._name: Optional[str] = None
        self._isFitted = False

    # for backwards compatibility with persisted feature generators based on code prior to commit 7088cbbe,
    # which lack the _isFitted attribute; we assume that each such feature generator was fitted
    def __setstate__(self, d):
        d["_isFitted"] = d.get("_isFitted", True)
        self.__dict__ = d

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        return dict(name=self.get_name())

    def get_name(self) -> str:
        """
        :return: the name of this feature generator, which may be a default name if the name has not been set. Note that feature
            generators created by a FeatureGeneratorFactory always get the name with which the generator factory was registered.
        """
        if self._name is None:
            return f"{self.__class__.__name__}-{id(self)}"
        return self._name

    def set_name(self, name: str) -> None:
        self._name = name

    def get_names(self) -> List[str]:
        """
        :return: the list of names of feature generators; will be a list with a single name for a regular feature generator
        """
        return [self.get_name()]

    def info(self):
        return {
            "name": self.get_name(),
            "categoricalFeatureNames": self.__categoricalFeatureNames,
            "generatedColumnNames": self.get_generated_column_names(),
            "isFitted": self.is_fitted(),
            "normalisationRules": self.get_normalisation_rules(),
        }

    def get_normalisation_rules(self, include_generated_categorical_rules=True) -> List[data_transformation.DFTNormalisation.Rule]:
        if include_generated_categorical_rules:
            return self._normalisationRules + self._categoricalFeatureRules
        else:
            return self._normalisationRules

    def get_categorical_feature_name_regex(self) -> Optional[str]:
        return self._categoricalFeatureNameRegex

    def is_categorical_feature(self, feature_name):
        if self._categoricalFeatureNameRegex is None:
            return False
        return re.fullmatch(self._categoricalFeatureNameRegex, feature_name) is not None

    def get_generated_column_names(self) -> Optional[List[str]]:
        """
        :return: the column names of the data frame generated by the most recent call of the feature generator's `generate` method;
            None if `generate` was never called
        """
        return self._generatedColumnNames

    def to_dft(self):
        return DFTFromFeatureGenerator(self)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        pass

    def fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        log.debug(f"Fitting {self}")
        self._fit(x, y=y, ctx=ctx)
        self._isFitted = True

    def is_fitted(self):
        return self._isFitted

    def generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``
        """
        if not self.is_fitted():
            raise Exception(f"Cannot generate features from a FeatureGenerator which is not fitted: "
                f"the feature generator {self.get_name()} requires fitting")

        log.debug(f"Generating features with {self}")
        result_df = self._generate(df, ctx=ctx)

        is_column_duplicated_array = result_df.columns.duplicated()
        if any(is_column_duplicated_array):
            duplicated_columns = set(result_df.columns[is_column_duplicated_array])
            raise DuplicateColumnNamesException(f"Feature data frame contains duplicate column names: {duplicated_columns}")

        # ensure that categorical columns have dtype 'category'
        categorical_feature_names = []
        if self._categoricalFeatureNameRegex is not None:
            result_df = result_df.copy()  # result_df might be a view of some other data frame, so we must copy it before modifying it
            categorical_feature_names = [col for col in result_df.columns if self.is_categorical_feature(col)]
            for col_name in categorical_feature_names:
                series = result_df[col_name].copy()
                if series.dtype.name != 'category':
                    result_df[col_name] = series.astype('category', copy=False)

        self._generatedColumnNames = result_df.columns

        # finalise the normalisation rule template (if any) by making it apply to all non-categorical features
        # (a default rule applies to categorical features)
        if self._mustUpdateNormalisationRuleBasedOnColumnNames:
            non_categorical_features = list(set(self._generatedColumnNames).difference(categorical_feature_names))
            # NOTE: We update the existing rule, which was instantiated with a dummy regex, because some mechanisms
            # (e.g. MultiFeatureGenerators) retrieve rule instances early on (before generate is ever called);
            # updating the existing rule is therefore the safe route and should always work, because rules
            # should never actually be applied before generate has indeed been called.
            self._normalisationRules[0].set_regex(or_regex_group(non_categorical_features))
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        return result_df

    @abstractmethod
    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame.

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``.
            The data frame's columns holding categorical columns are not required to have dtype ``category``;
            this will be ensured by the encapsulating call as long as the respective columns' names
            were appropriately provided at construction.
        """
        pass

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits the feature generator and subsequently generates features for the data points in the given data frame

        :param x: the input data frame for the learning problem and for which to generate features
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``x`` (and ``y``)
        """
        self.fit(x, y, ctx)
        return self.generate(x, ctx)

    def flattened(self,
            columns_to_flatten: List[str] = None,
            normalisation_rules=(),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            keep_other_columns=True) -> "ChainedFeatureGenerator":
        """
        Returns a new feature generator which returns flattened versions of one or more of the vector-valued columns generated
        by this feature generator.

        :param columns_to_flatten: the list of columns to flatten; if None, flatten all columns
        :param normalisation_rules: a list of normalisation rules which apply to the flattened columns
        :param normalisation_rule_template: a normalisation rule template which applies to all generated flattened columns
        :param keep_other_columns: if True, any additional columns that are not flattened are retained
            by the returned feature generator; if False, additional columns are discarded
        :return: a feature generator which generates the flattened columns
        """
        return flattened_feature_generator(self, columns_to_flatten=columns_to_flatten, normalisation_rules=normalisation_rules,
            keep_other_columns=keep_other_columns, normalisation_rule_template=normalisation_rule_template)

    def concat(self, *others: "FeatureGenerator") -> "MultiFeatureGenerator":
        """
        Concatenates this feature generator with one or more other feature generators in order to produce a feature generator that
        jointly generates all features

        :param others: other feature generators
        :return: a :class:`MultiFeatureGenerator`
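
        An illustrative usage sketch (using the rule-based generators defined later in this module):

        >>> fg = FeatureGeneratorTakeColumns("a").concat(FeatureGeneratorTakeColumns("b"))
        >>> fg.generate(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))
           a  b
        0  1  2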
        """
        if isinstance(self, MultiFeatureGenerator):
            fgens = list(self.featureGenerators)
        else:
            fgens = [self]
        fgens.extend(others)
        return MultiFeatureGenerator(fgens)

    def chain(self, *others: "FeatureGenerator") -> "ChainedFeatureGenerator":
        """
        Chains this feature generator with one or more other feature generators such that each feature generator
        receives as input the output of the preceding feature generator. The resulting feature generator
        produces the features of the last element in the chain.

        :param others: other feature generators
        :return: a :class:`ChainedFeatureGenerator`
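
        An illustrative usage sketch (flattening a vector-valued column):

        >>> fg = FeatureGeneratorTakeColumns("vec").chain(FeatureGeneratorFlattenColumns())
        >>> fg.generate(pd.DataFrame({"vec": [[1, 2]], "other": [0]}))
           vec_0  vec_1
        0      1      2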
        """
        if isinstance(self, ChainedFeatureGenerator):
            fgens = self.featureGenerators
        else:
            fgens = [self]
        fgens.extend(others)
        return ChainedFeatureGenerator(fgens)


class RuleBasedFeatureGenerator(FeatureGenerator, ABC):
    """
    A feature generator which does not require fitting
    """
    def fit(self, x, y=None, ctx=None):
        pass

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        pass

    def is_fitted(self):
        return True


class MultiFeatureGenerator(FeatureGenerator):
    """
    Wrapper for multiple feature generators. Calling generate here applies all given feature generators independently and
    returns the concatenation of their outputs.
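
    An illustrative usage sketch:

    >>> mfg = MultiFeatureGenerator(FeatureGeneratorTakeColumns("a"), FeatureGeneratorTakeColumns("b"))
    >>> mfg.fit_generate(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))
       a  b
    0  1  2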
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        self.featureGenerators = feature_generators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            log.debug("Creating an empty MultiFeatureGenerator. It will generate a data frame without columns.")
        categorical_feature_name_regexes = [regex for regex in [fg.get_categorical_feature_name_regex()
                for fg in feature_generators] if regex is not None]
        if len(categorical_feature_name_regexes) > 0:
            categorical_feature_names = "|".join(categorical_feature_name_regexes)
        else:
            categorical_feature_names = ()
        normalisation_rules = util.concat_sequences([fg.get_normalisation_rules() for fg in feature_generators])
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            add_categorical_default_rules=False)

    def _tostring_object_info(self) -> str:
        return f"featureGenerators={list_string(self.featureGenerators)}"

    def _generate_from_multiple(self, generate_features: Callable[[FeatureGenerator], pd.DataFrame], index) -> pd.DataFrame:
        dfs = []
        for fg in self.featureGenerators:
            df = generate_features(fg)
            dfs.append(df)
        if len(dfs) == 0:
            return pd.DataFrame(index=index)
        else:
            combined_df = pd.concat(dfs, axis=1)
            if len(combined_df.columns) != len(set(combined_df.columns)):
                raise Exception(f"At least one column was generated more than once: {list(combined_df.columns)}; "
                    f"check feature generators for correctness!")
            return combined_df

    def _generate(self, input_df: pd.DataFrame, ctx=None):
        def generate_features(fg: FeatureGenerator):
            return fg.generate(input_df, ctx=ctx)
        return self._generate_from_multiple(generate_features, input_df.index)

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")

        def generate_features(fg: FeatureGenerator):
            return fg.fit_generate(x, y, ctx)

        return self._generate_from_multiple(generate_features, x.index)

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        for fg in self.featureGenerators:
            fg.fit(x, y)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["featureGeneratorNames"] = self.get_names()
        return info

    def get_names(self) -> list:
        return functools.reduce(lambda x, y: x + y, [fg.get_names() for fg in self.featureGenerators], [])


class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
    """
    Generates feature values for one data point at a time, creating a dictionary with
    feature values from each named tuple.
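
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class LengthFeatureGenerator(FeatureGeneratorFromNamedTuples):
    ...     def _fit(self, x, y=None, ctx=None):
    ...         pass  # nothing to fit in this illustrative case
    ...     def _generate_feature_dict(self, named_tuple):
    ...         return {"text_length": len(named_tuple.text)}
    >>> LengthFeatureGenerator().fit_generate(pd.DataFrame({"text": ["ab", "abc"]}))
       text_length
    0            2
    1            3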
    """
    def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.cache = cache

    def _generate(self, df: pd.DataFrame, ctx=None):
        dicts = []
        for idx, nt in enumerate(df.itertuples()):
            nt: PandasNamedTuple
            if idx % 100 == 0:
                log.debug(f"Generating feature via {self.__class__.__name__} for index {idx}")
            value = None
            if self.cache is not None:
                value = self.cache.get(nt.Index)
            if value is None:
                value = self._generate_feature_dict(nt)
                if self.cache is not None:
                    self.cache.set(nt.Index, value)
            dicts.append(value)
        return pd.DataFrame(dicts, index=df.index)

    @abstractmethod
    def _generate_feature_dict(self, named_tuple) -> Dict[str, Any]:
        """
        Creates a dictionary with feature values from a named tuple

        :param named_tuple: the data point for which to generate features
        :return: a dictionary mapping feature names to values
        """
        pass


class FeatureGeneratorTakeColumns(RuleBasedFeatureGenerator):
    def __init__(self, columns: Union[str, List[str]] = None, except_columns: Sequence[str] = (),
            categorical_feature_names: Optional[Union[Sequence[str], str]] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            verify_column_names=True):
        """
        :param columns: the name of the column or list of names of columns to be taken. If None, all columns will be taken.
        :param except_columns: list of names of columns not to take, if present in the input data frame
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations, e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: this parameter can be supplied instead of `normalisation_rules` for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labelled as
            categorical.
        :param verify_column_names: if True and columns to take were specified, an error will be raised in case said columns
            are missing during feature generation; if False, the issue will merely be logged on INFO level
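
        An illustrative usage sketch:

        >>> FeatureGeneratorTakeColumns("a").generate(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
           a
        0  1
        1  2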
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.exceptColumns = except_columns
        self.verifyColumnNames = verify_column_names

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        columns_to_take = self.columns if self.columns is not None else df.columns
        columns_to_take = [col for col in columns_to_take if col not in self.exceptColumns]

        if self.columns is not None:
            missing_cols = set(columns_to_take).difference(df.columns)
            if len(missing_cols) > 0:
                missing_cols_notification = f"Columns {missing_cols} were specified but are not present in the data frame. " \
                    f"verifyColumnNames was set to {self.verifyColumnNames}; " \
                    f"available columns: {list(df.columns)}"
                if self.verifyColumnNames:
                    raise RuntimeError(missing_cols_notification)
                log.info(missing_cols_notification)
        return df[columns_to_take]

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["exceptColumns"] = self.exceptColumns
        return info


class FeatureGeneratorFlattenColumns(RuleBasedFeatureGenerator):
    """
    Instances of this class take columns containing vectors and create a data frame with columns containing the entries of
    these vectors.

    For example, if columns "vec1", "vec2" contain vectors of dimensions dim1, dim2, a data frame with dim1+dim2 new columns
    will be created. It will contain the columns "vec1_<i1>", "vec2_<i2>" with i1, i2 ranging in (0, dim1), (0, dim2).
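
    An illustrative usage sketch:

    >>> FeatureGeneratorFlattenColumns("vec").generate(pd.DataFrame({"vec": [[1, 2], [3, 4]]}))
       vec_0  vec_1
    0      1      2
    1      3      4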
    """
    def __init__(self, columns: Optional[Union[str, Sequence[str]]] = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param columns: the name of the column or list of names of columns to be flattened. If None, all columns will be flattened.
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        result_df = pd.DataFrame(index=df.index)
        columns_to_flatten = self.columns if self.columns is not None else df.columns
        for col in columns_to_flatten:
            log.debug(f"Flattening column {col}")
            # NOTE: we found the use of np.stack to produce the most runtime-efficient results;
            # other variants, e.g. based on lists instead of numpy arrays, perform much worse
            values = np.stack(df[col].values)
            if len(values.shape) != 2:
                raise ValueError(f"Column {col} was expected to contain one-dimensional vectors; something went wrong")
            dimension = values.shape[1]
            new_columns = [f"{col}_{i}" for i in range(dimension)]
            log.debug(f"Flattening resulted in {len(new_columns)} new columns")
            result_df[new_columns] = pd.DataFrame(values, index=df.index)
        return result_df

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        return info


class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
    """
    Implements a feature generator via a column generator
    """
    log = log.getChild(__qualname__)

    def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param column_gen: the underlying column generator
        :param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
            in the input data, simply copy it to generate the output (without using the column generator); if False, always
            apply the column generator to generate the output
        :param is_categorical: whether the resulting column is categorical
        :param normalisation_rule_template: template for a DFTNormalisation rule for the resulting column;
            this should only be provided if is_categorical is False
        """
        if is_categorical and normalisation_rule_template is not None:
            raise ValueError("normalisation_rule_template should be None when the generated column is categorical")

        categorical_feature_names = (column_gen.generatedColumnName,) if is_categorical else ()
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rule_template=normalisation_rule_template)

        self.takeInputColumnIfPresent = take_input_column_if_present
        self.columnGen = column_gen

    def info(self):
        info = super().info()
        info["takeInputColumnIfPresent"] = self.takeInputColumnIfPresent
        info["generatedColName"] = self.columnGen.generatedColumnName
        return info

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        col_name = self.columnGen.generatedColumnName
        if self.takeInputColumnIfPresent and col_name in df.columns:
            self.log.debug(f"Taking column '{col_name}' from input data frame")
            series = df[col_name]
        else:
            self.log.debug(f"Generating column '{col_name}' via {self.columnGen}")
            series = self.columnGen.generate_column(df)
        return pd.DataFrame({col_name: series})


class ChainedFeatureGenerator(FeatureGenerator):
    """
    Chains feature generators such that they are executed one after another. The output of the i-th generator is the input of
    the (i+1)-th generator in the sequence.
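
    An illustrative usage sketch:

    >>> cfg = ChainedFeatureGenerator(FeatureGeneratorTakeColumns("vec"), FeatureGeneratorFlattenColumns())
    >>> cfg.fit_generate(pd.DataFrame({"vec": [[1, 2]], "other": [0]}))
       vec_0  vec_1
    0      1      2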
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        """
        :param feature_generators: feature generators to apply in order; the properties of the last feature generator
            determine the relevant meta-data such as categorical feature names and normalisation rules
        """
        self.featureGenerators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            raise ValueError("Empty list of feature generators")
        last_fg: FeatureGenerator = self.featureGenerators[-1]
        super().__init__(
            categorical_feature_names=last_fg.get_categorical_feature_name_regex(), normalisation_rules=last_fg.get_normalisation_rules(),
            add_categorical_default_rules=False)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        for feature_gen in self.featureGenerators:
            df = feature_gen.generate(df, ctx)
        return df

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.fit_generate(x, y, ctx)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["chainedFeatureGeneratorNames"] = self.get_names()
        return info

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")
        for fg in self.featureGenerators:
            x = fg.fit_generate(x, y, ctx)
        return x


class FeatureGeneratorTargetDistribution(FeatureGenerator):
    """
    A feature generator which, for a column T (typically the categorical target column of a classification problem
    or the continuous target column of a regression problem),

    * can ensure that T takes on a limited set of values t_1, ..., t_n by allowing the user to apply
      binning using given bin boundaries,
    * computes for each value c of a categorical column C the conditional empirical distribution
      P(T | C=c) in the training data during the training phase,
    * generates, for each requested column C and value c in the column, n features
      '<C>_<T>_distribution_<t_i>' = P(T=t_i | C=c) if flatten=True
      or one feature '<C>_<T>_distribution' = [P(T=t_1 | C=c), ..., P(T=t_n | C=c)] if flatten=False.

    Being probability values, the features generated by this feature generator are already normalised.
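
    An illustrative usage sketch (column and target names are hypothetical):

    >>> x = pd.DataFrame({"color": ["r", "r", "b"]})
    >>> y = pd.DataFrame({"label": [0, 1, 1]})
    >>> fg = FeatureGeneratorTargetDistribution("color", "label", target_column_bins=None)
    >>> features = fg.fit_generate(x, y)  # e.g. P(label=0 | color="r") = 0.5
    >>> sorted(features.columns)
    ['color_label_distribution_0', 'color_label_distribution_1']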
    """
    def __init__(self,
            columns: Union[str, Sequence[str]],
            target_column: str,
            target_column_bins: Optional[Union[Sequence[float], int, pd.IntervalIndex]],
            target_column_in_features_df=False,
            flatten=True):
        """
        :param columns: the categorical columns for which to generate distribution features
        :param target_column: the column the distributions over which will make up the features.
            If target_column_bins is not None, this column will be discretised before computing the conditional distributions.
        :param target_column_bins: if not None, specifies the binning to apply via pandas.cut
            (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html).
            Note that if a value matches no bin, NaN will be generated. To avoid this when specifying bin boundaries in a list,
            -inf and +inf should be used as the first and last entries.
        :param target_column_in_features_df: if True, the target column will be looked up in the features data frame (X) during
            fitting instead of in the target data frame (Y)
        :param flatten: whether to generate a separate scalar feature per distribution value rather than one feature
            with all of the distribution's values
        """
        self.flatten = flatten
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.targetColumn = target_column
        self.targetColumnInFeaturesDf = target_column_in_features_df
        self.targetColumnBins = target_column_bins
        if self.flatten:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(skip=True)
        else:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(unsupported=True)
        super().__init__(normalisation_rule_template=normalisation_rule_template)
        self._targetColumnValues = None
        # this will hold the mapping: column -> featureValue -> targetValue -> targetValueEmpiricalProbability
        self._discreteTargetDistributionsByColumn: Optional[Dict[str, Dict[Any, Dict[Any, float]]]] = None

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["targetColumn"] = self.targetColumn
        info["targetColumnBins"] = self.targetColumnBins
        info["flatten"] = self.flatten
        return info

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Persists the empirical target probability distributions for all unique values in the specified columns
        """
        if self.targetColumnInFeaturesDf:
            target = x[self.targetColumn]
        else:
            target = y[self.targetColumn]
        if self.targetColumnBins is not None:
            discretised_target = pd.cut(target, self.targetColumnBins)
        else:
            discretised_target = target
        self._targetColumnValues = discretised_target.unique()

        self._discreteTargetDistributionsByColumn = {}
        for column in self.columns:
            self._discreteTargetDistributionsByColumn[column] = {}
            column_target_df = pd.DataFrame()
            column_target_df[column] = x[column]
            column_target_df["target"] = discretised_target.values
            for value, value_targets_df in column_target_df.groupby(column):
                # the normalised value_counts contain the mapping targetValue -> targetValueEmpiricalProbability for the current value
                self._discreteTargetDistributionsByColumn[column][value] = \
                    value_targets_df["target"].value_counts(normalize=True).to_dict()

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._discreteTargetDistributionsByColumn is None:
            raise Exception("Feature generator has not been fitted")
        result_df = pd.DataFrame(index=df.index)
        for column in self.columns:
            target_distribution_by_value = self._discreteTargetDistributionsByColumn[column]
            if self.flatten:
                for target_value in self._targetColumnValues:
                    # Important: pd.Series.apply should not be used here, as it would label the resulting column as categorical
                    result_df[f"{column}_{self.targetColumn}_distribution_{target_value}"] = \
                        [target_distribution_by_value[value].get(target_value, 0.0) for value in df[column]]
            else:
                distributions = [[target_distribution_by_value[value].get(target_value, 0.0)
                    for target_value in self._targetColumnValues] for value in df[column]]
                result_df[f"{column}_{self.targetColumn}_distribution"] = pd.Series(distributions, index=df[column].index)
        return result_df


class FeatureGeneratorFromVectorModel(FeatureGenerator):
    def __init__(self,
            vector_model: "VectorModel",
            target_feature_generator: FeatureGenerator,
            categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            input_feature_generator: FeatureGenerator = None,
            use_target_feature_generator_for_training=False):
        """
        Provides features via the predictions of a given model

        :param vector_model: the model whose predictions are used to generate features
        :param target_feature_generator: the generator for the target to be predicted
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        :param input_feature_generator: optional feature generator to be applied to the input of the vector model's fit and predict
        :param use_target_feature_generator_for_training: if False, this generator will always apply the model
            to generate features.
            If True, this generator will use target_feature_generator to generate features, bypassing the model.
            This is useful for the case where the model which is to receive the generated features shall be
            trained on the original targets rather than the predictions thereof.
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)

        self.useTargetFeatureGeneratorForTraining = use_target_feature_generator_for_training
        self.targetFeatureGenerator = target_feature_generator
        self.inputFeatureGenerator = input_feature_generator
        self.vectorModel = vector_model

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        target_df = self.targetFeatureGenerator.fit_generate(x, y)
        if self.inputFeatureGenerator:
            x = self.inputFeatureGenerator.fit_generate(x, y)
        self.vectorModel.fit(x, target_df)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self.inputFeatureGenerator:
            df = self.inputFeatureGenerator.generate(df)
        if self.useTargetFeatureGeneratorForTraining and not ctx.is_fitted():
            log.debug(f"Using targetFeatureGenerator {self.targetFeatureGenerator.__class__.__name__} to generate target features")
            return self.targetFeatureGenerator.generate(df)
        else:
            log.debug(f"Generating target features via {self.vectorModel.__class__.__name__}")
            return self.vectorModel.predict(df)

    def info(self):
        info = super().info()
        info["wrappedModel"] = str(self.vectorModel)
        return info


class FeatureGeneratorMapColumn(RuleBasedFeatureGenerator, ABC):
    """
    Creates a single feature from a single input column by applying a function to each element of the input column.
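
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class SquareColumn(FeatureGeneratorMapColumn):
    ...     def __init__(self):
    ...         super().__init__(input_col_name="x", feature_col_name="x_squared")
    ...     def _create_value(self, value):
    ...         return value ** 2
    >>> SquareColumn().generate(pd.DataFrame({"x": [2, 3]}))
       x_squared
    0          4
    1          9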
    """
    def __init__(self,
            input_col_name: str,
            feature_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name
        self._featureColName = feature_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = input_series.apply(self._create_value)
        return pd.DataFrame({self._featureColName: values}, index=df.index)

    @abstractmethod
    def _create_value(self, value):
        """
        Maps a value from the input column to a feature value

        :param value: a value from the input column
        :return: the feature value
        """
        pass


class FeatureGeneratorMapColumnDict(RuleBasedFeatureGenerator, ABC):
    """
    Creates an arbitrary number of features from a single input column by applying a function to each element of the input column.
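
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class PointFeatures(FeatureGeneratorMapColumnDict):
    ...     def __init__(self):
    ...         super().__init__(input_col_name="point")
    ...     def _create_features_dict(self, value):
    ...         return {"x": value[0], "y": value[1]}
    >>> PointFeatures().generate(pd.DataFrame({"point": [(1, 2)]}))
       x  y
    0  1  2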
    """
    def __init__(self, input_col_name: str, categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None, add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = [self._create_features_dict(v) for v in input_series]
        return pd.DataFrame(values, index=df.index)

    @abstractmethod
    def _create_features_dict(self, value) -> Dict[str, Any]:
        """
        Maps a value from the input column to a dictionary containing one or more features.

        :param value: a value from the input column
        :return: a dictionary mapping feature names to values
        """
        pass


class FeatureGeneratorNAMarker(RuleBasedFeatureGenerator):
    """
    Creates features indicating whether another feature is N/A (not available).
    It can be practical to use this feature generator in conjunction with DFTFillNA for models that cannot handle missing values.
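
    An illustrative usage sketch:

    >>> FeatureGeneratorNAMarker(["a"]).generate(pd.DataFrame({"a": [1.0, None]}))
       a_na
    0     0
    1     1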
    """
    def __init__(self, columns: List[str], value_a=0, value_na=1):
        """
        Note: When changing the default values, use only values that are considered to be normalised when using this
        feature generator in a context where DFTNormalisation is applied (no normalisation is applied to features generated
        by this feature generator).

        :param columns: the columns for which to generate N/A marker features
        :param value_a: the feature value if the input feature is available
        :param value_na: the feature value if the input feature is not available
        """
        super().__init__(normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True))
        self.columns = columns
        self.valueA = value_a
        self.valueNA = value_na

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        new_cols = {}
        value_map = {True: self.valueNA, False: self.valueA}
        for col in self.columns:
            new_cols[f"{col}_na"] = [value_map[is_na] for is_na in df[col].isna()]
        return pd.DataFrame(new_cols, index=df.index)


def flattened_feature_generator(fgen: FeatureGenerator, columns_to_flatten: List[str] = None, keep_other_columns=True,
        normalisation_rules: Sequence[DFTNormalisation.Rule] = (),
        normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
    """
    Creates a flattening version of the given feature generator.

    :param fgen: the feature generator which generates columns that are to be flattened
    :param columns_to_flatten: list of names of output columns to be flattened; if None, flatten all columns
    :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained
        by the returned feature generator
    :param normalisation_rules: additional normalisation rules for the flattened output columns
    :param normalisation_rule_template: this parameter can be supplied instead of normalisation_rules for the case where
        there shall be a single rule that applies to all flattened output columns
    :return: a feature generator instance that will generate flattened versions of the specified columns and leave
        all other output columns as is

    Example:

    >>> from sensai.featuregen import FeatureGeneratorTakeColumns, flattened_feature_generator
    >>> import pandas as pd
    >>>
    >>> df = pd.DataFrame({"foo": [[1, 2], [3, 4]], "bar": ["a", "b"]})
    >>> fgen = flattened_feature_generator(FeatureGeneratorTakeColumns(), columns_to_flatten=["foo"])
    >>> fgen.generate(df)
       foo_0  foo_1  bar
    0      1      2    a
    1      3      4    b
    """
    flattening_generator = FeatureGeneratorFlattenColumns(columns=columns_to_flatten, normalisation_rules=normalisation_rules,
        normalisation_rule_template=normalisation_rule_template)
    if columns_to_flatten is None or not keep_other_columns:
        return ChainedFeatureGenerator(fgen, flattening_generator)
    else:
        return ChainedFeatureGenerator(fgen,
            MultiFeatureGenerator(flattening_generator, FeatureGeneratorTakeColumns(except_columns=columns_to_flatten)))


class FeatureGeneratorFromDFT(FeatureGenerator):
    def __init__(self, dft: DataFrameTransformer, categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self.dft = dft

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.dft.fit(x)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        return self.dft.apply(df)