import functools
import logging
import re
from abc import ABC, abstractmethod
from typing import Sequence, List, Union, Callable, Any, Dict, TYPE_CHECKING, Optional

import numpy as np
import pandas as pd

from .. import util, data_transformation
from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
from ..util import flatten_arguments
from ..util.string import or_regex_group, ToStringMixin, list_string
from ..util.typing import PandasNamedTuple

if TYPE_CHECKING:
    from ..vector_model import VectorModel
    from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)


class DuplicateColumnNamesException(Exception):
    pass


class FeatureGenerator(ToStringMixin, ABC):
    """
    Base class for feature generators that create a new DataFrame containing feature values
    from an input DataFrame
    """
    def __init__(self,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules: bool = True):
        """
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations, e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: this parameter can be supplied instead of `normalisation_rules` for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labelled as
            categorical. Like `normalisation_rules`, this is only relevant if a DFTNormalisation object consuming
            normalisation rules is instantiated and used within a data processing pipeline.
            It does not affect feature generation.
        :param add_categorical_default_rules:
            if True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding
            one-hot encoded features (with "_<index>" appended) will be added. This does not affect feature generation.
        """
        # NOTE: While it would be more elegant not to have all of the above constructor arguments and instead provide
        # them later using "with*" methods, this would have the significant drawback that it would enable
        # all such attributes to be provided in all subclasses, even in ones where we know the settings exactly
        # and can provide them directly in the subclass constructor implementation. It would thus enable
        # nonsensical settings, which should be avoided.
        if len(normalisation_rules) > 0 and normalisation_rule_template is not None:
            raise ValueError("Normalisation rules must be empty when a rule template is provided")

        self._generatedColumnNames = None
        self.__categoricalFeatureNames = categorical_feature_names

        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: str = categorical_feature_name_regex
        self._categoricalFeatureRules = []

        if normalisation_rule_template is not None:
            # NOTE: the placeholder rule's regex will be set in generate
            self._normalisationRules = [normalisation_rule_template.to_placeholder_rule()]
            self._mustUpdateNormalisationRuleBasedOnColumnNames = True
        else:
            self._normalisationRules = list(normalisation_rules)
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        if add_categorical_default_rules:
            if categorical_feature_name_regex is not None:
                self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex,
                    unsupported=True))
                self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex + r"_\d+",
                    skip=True))  # rule for one-hot transformation

        self._name: Optional[str] = None
        self._isFitted = False

    # for backwards compatibility with persisted feature generators based on code prior to commit 7088cbbe,
    # which lack the _isFitted attribute; we assume that each such feature generator was fitted
    def __setstate__(self, d):
        d["_isFitted"] = d.get("_isFitted", True)
        self.__dict__ = d

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        return dict(name=self.get_name())

    def get_name(self) -> str:
        """
        :return: the name of this feature generator, which may be a default name if the name has not been set. Note that feature
            generators created by a FeatureGeneratorFactory always get the name with which the generator factory was registered.
        """
        if self._name is None:
            return f"{self.__class__.__name__}-{id(self)}"
        return self._name

    def set_name(self, name: str) -> None:
        self._name = name

    def get_names(self) -> List[str]:
        """
        :return: the list of names of feature generators; will be a list with a single name for a regular feature generator
        """
        return [self.get_name()]

    def info(self):
        return {
            "name": self.get_name(),
            "categoricalFeatureNames": self.__categoricalFeatureNames,
            "generatedColumnNames": self.get_generated_column_names(),
            "isFitted": self.is_fitted(),
            "normalisationRules": self.get_normalisation_rules(),
        }

    def get_normalisation_rules(self, include_generated_categorical_rules=True) -> List[data_transformation.DFTNormalisation.Rule]:
        if include_generated_categorical_rules:
            return self._normalisationRules + self._categoricalFeatureRules
        else:
            return self._normalisationRules

    def get_categorical_feature_name_regex(self) -> Optional[str]:
        return self._categoricalFeatureNameRegex

    def is_categorical_feature(self, feature_name):
        if self._categoricalFeatureNameRegex is None:
            return False
        return re.fullmatch(self._categoricalFeatureNameRegex, feature_name) is not None

    def get_generated_column_names(self) -> Optional[List[str]]:
        """
        :return: the column names of the data frame generated by the most recent call of the feature generator's `generate` method;
            None if `generate` was never called
        """
        return self._generatedColumnNames

    def to_dft(self):
        return DFTFromFeatureGenerator(self)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        pass

    def fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        log.debug(f"Fitting {self}")
        self._fit(x, y=y, ctx=ctx)
        self._isFitted = True

    def is_fitted(self):
        return self._isFitted

    def generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``
        """
        if not self.is_fitted():
            raise Exception(f"Cannot generate features from a FeatureGenerator which is not fitted: "
                f"the feature generator {self.get_name()} requires fitting")

        log.debug(f"Generating features with {self}")
        result_df = self._generate(df, ctx=ctx)

        is_column_duplicated_array = result_df.columns.duplicated()
        if any(is_column_duplicated_array):
            duplicated_columns = set(result_df.columns[is_column_duplicated_array])
            raise DuplicateColumnNamesException(f"Feature data frame contains duplicate column names: {duplicated_columns}")

        # ensure that categorical columns have dtype 'category'
        categorical_feature_names = []
        if self._categoricalFeatureNameRegex is not None:
            result_df = result_df.copy()  # result_df might be a view of some other data frame, so we must copy it before modifying it
            categorical_feature_names = [col for col in result_df.columns if self.is_categorical_feature(col)]
            for col_name in categorical_feature_names:
                series = result_df[col_name].copy()
                if series.dtype.name != 'category':
                    result_df[col_name] = series.astype('category', copy=False)

        self._generatedColumnNames = result_df.columns

        # finalise the normalisation rule template (if any) by making it apply to all non-categorical features
        # (a default rule applies to categorical features)
        if self._mustUpdateNormalisationRuleBasedOnColumnNames:
            non_categorical_features = list(set(self._generatedColumnNames).difference(categorical_feature_names))
            # NOTE: We update the existing rule, which was instantiated with a dummy regex, because some mechanisms
            # (e.g. MultiFeatureGenerators) retrieve rule instances early on (before generate is ever called);
            # updating the existing rule is therefore the safe route and should always work, because rules
            # should never actually be applied before generate has indeed been called.
            self._normalisationRules[0].set_regex(or_regex_group(non_categorical_features))
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        return result_df

    @abstractmethod
    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame.

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``.
            The data frame's columns holding categorical columns are not required to have dtype ``category``;
            this will be ensured by the encapsulating call as long as the respective columns' names
            were appropriately provided at construction.
        """
        pass

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits the feature generator and subsequently generates features for the data points in the given data frame

        :param x: the input data frame for the learning problem and for which to generate features
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``x`` (and ``y``)
        """
        self.fit(x, y, ctx)
        return self.generate(x, ctx)

    def flattened(self,
            columns_to_flatten: List[str] = None,
            normalisation_rules=(),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            keep_other_columns=True) -> "ChainedFeatureGenerator":
        """
        Returns a new feature generator which returns flattened versions of one or more of the vector-valued columns generated
        by this feature generator.

        :param columns_to_flatten: the list of columns to flatten; if None, flatten all columns
        :param normalisation_rules: a list of normalisation rules which apply to the flattened columns
        :param normalisation_rule_template: a normalisation rule template which applies to all generated flattened columns
        :param keep_other_columns: if True, any additional columns that are not flattened are retained
            by the returned feature generator; if False, additional columns are discarded
        :return: a feature generator which generates the flattened columns
        """
        return flattened_feature_generator(self, columns_to_flatten=columns_to_flatten, normalisation_rules=normalisation_rules,
            keep_other_columns=keep_other_columns, normalisation_rule_template=normalisation_rule_template)

    def concat(self, *others: "FeatureGenerator") -> "MultiFeatureGenerator":
        """
        Concatenates this feature generator with one or more other feature generators in order to produce a feature generator that
        jointly generates all features

        :param others: other feature generators
        :return: a :class:`MultiFeatureGenerator`
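
        An illustrative usage sketch (using the rule-based generators defined later in this module):

        >>> fg = FeatureGeneratorTakeColumns("a").concat(FeatureGeneratorTakeColumns("b"))
        >>> fg.generate(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))
           a  b
        0  1  2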
        """
        if isinstance(self, MultiFeatureGenerator):
            fgens = list(self.featureGenerators)
        else:
            fgens = [self]
        fgens.extend(others)
        return MultiFeatureGenerator(fgens)

    def chain(self, *others: "FeatureGenerator") -> "ChainedFeatureGenerator":
        """
        Chains this feature generator with one or more other feature generators such that each feature generator
        receives as input the output of the preceding feature generator. The resulting feature generator
        produces the features of the last element in the chain.

        :param others: other feature generators
        :return: a :class:`ChainedFeatureGenerator`
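
        An illustrative usage sketch (flattening a vector-valued column):

        >>> fg = FeatureGeneratorTakeColumns("vec").chain(FeatureGeneratorFlattenColumns())
        >>> fg.generate(pd.DataFrame({"vec": [[1, 2]], "other": [0]}))
           vec_0  vec_1
        0      1      2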
        """
        if isinstance(self, ChainedFeatureGenerator):
            fgens = self.featureGenerators
        else:
            fgens = [self]
        fgens.extend(others)
        return ChainedFeatureGenerator(fgens)


class RuleBasedFeatureGenerator(FeatureGenerator, ABC):
    """
    A feature generator which does not require fitting
    """
    def fit(self, x, y=None, ctx=None):
        pass

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        pass

    def is_fitted(self):
        return True


class MultiFeatureGenerator(FeatureGenerator):
    """
    Wrapper for multiple feature generators. Calling generate here applies all given feature generators independently and
    returns the concatenation of their outputs.
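
    An illustrative usage sketch:

    >>> mfg = MultiFeatureGenerator(FeatureGeneratorTakeColumns("a"), FeatureGeneratorTakeColumns("b"))
    >>> mfg.fit_generate(pd.DataFrame({"a": [1], "b": [2], "c": [3]}))
       a  b
    0  1  2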
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        self.featureGenerators = feature_generators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            log.debug("Creating an empty MultiFeatureGenerator. It will generate a data frame without columns.")
        categorical_feature_name_regexes = [regex for regex in [fg.get_categorical_feature_name_regex()
                for fg in feature_generators] if regex is not None]
        if len(categorical_feature_name_regexes) > 0:
            categorical_feature_names = "|".join(categorical_feature_name_regexes)
        else:
            categorical_feature_names = ()
        normalisation_rules = util.concat_sequences([fg.get_normalisation_rules() for fg in feature_generators])
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            add_categorical_default_rules=False)

    def _tostring_object_info(self) -> str:
        return f"featureGenerators={list_string(self.featureGenerators)}"

    def _generate_from_multiple(self, generate_features: Callable[[FeatureGenerator], pd.DataFrame], index) -> pd.DataFrame:
        dfs = []
        for fg in self.featureGenerators:
            df = generate_features(fg)
            dfs.append(df)
        if len(dfs) == 0:
            return pd.DataFrame(index=index)
        else:
            combined_df = pd.concat(dfs, axis=1)
            if len(combined_df.columns) != len(set(combined_df.columns)):
                raise Exception(f"At least one column was generated more than once: {list(combined_df.columns)}; "
                    f"check feature generators for correctness!")
            return combined_df

    def _generate(self, input_df: pd.DataFrame, ctx=None):
        def generate_features(fg: FeatureGenerator):
            return fg.generate(input_df, ctx=ctx)
        return self._generate_from_multiple(generate_features, input_df.index)

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")

        def generate_features(fg: FeatureGenerator):
            return fg.fit_generate(x, y, ctx)

        return self._generate_from_multiple(generate_features, x.index)

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        for fg in self.featureGenerators:
            fg.fit(x, y)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["featureGeneratorNames"] = self.get_names()
        return info

    def get_names(self) -> list:
        return functools.reduce(lambda x, y: x + y, [fg.get_names() for fg in self.featureGenerators], [])


class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
    """
    Generates feature values for one data point at a time, creating a dictionary with
    feature values from each named tuple.
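
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class LengthFeatureGenerator(FeatureGeneratorFromNamedTuples):
    ...     def _fit(self, x, y=None, ctx=None):
    ...         pass  # nothing to fit in this illustrative case
    ...     def _generate_feature_dict(self, named_tuple):
    ...         return {"text_length": len(named_tuple.text)}
    >>> LengthFeatureGenerator().fit_generate(pd.DataFrame({"text": ["ab", "abc"]}))
       text_length
    0            2
    1            3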
    """
    def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.cache = cache

    def _generate(self, df: pd.DataFrame, ctx=None):
        dicts = []
        for idx, nt in enumerate(df.itertuples()):
            nt: PandasNamedTuple
            if idx % 100 == 0:
                log.debug(f"Generating feature via {self.__class__.__name__} for index {idx}")
            value = None
            if self.cache is not None:
                value = self.cache.get(nt.Index)
            if value is None:
                value = self._generate_feature_dict(nt)
                if self.cache is not None:
                    self.cache.set(nt.Index, value)
            dicts.append(value)
        return pd.DataFrame(dicts, index=df.index)

    @abstractmethod
    def _generate_feature_dict(self, named_tuple) -> Dict[str, Any]:
        """
        Creates a dictionary with feature values from a named tuple

        :param named_tuple: the data point for which to generate features
        :return: a dictionary mapping feature names to values
        """
        pass


class FeatureGeneratorTakeColumns(RuleBasedFeatureGenerator):
    def __init__(self, columns: Union[str, List[str]] = None, except_columns: Sequence[str] = (),
            categorical_feature_names: Optional[Union[Sequence[str], str]] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            verify_column_names=True):
        """
        :param columns: the name of the column or list of names of columns to be taken. If None, all columns will be taken.
        :param except_columns: list of names of columns not to take, if present in the input data frame
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations, e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: this parameter can be supplied instead of `normalisation_rules` for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labelled as
            categorical.
        :param verify_column_names: if True and columns to take were specified, an error will be raised in case said columns
            are missing during feature generation; if False, the issue will merely be logged on INFO level
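
        An illustrative usage sketch:

        >>> FeatureGeneratorTakeColumns("a").generate(pd.DataFrame({"a": [1, 2], "b": [3, 4]}))
           a
        0  1
        1  2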
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.exceptColumns = except_columns
        self.verifyColumnNames = verify_column_names

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        columns_to_take = self.columns if self.columns is not None else df.columns
        columns_to_take = [col for col in columns_to_take if col not in self.exceptColumns]

        if self.columns is not None:
            missing_cols = set(columns_to_take).difference(df.columns)
            if len(missing_cols) > 0:
                missing_cols_notification = f"Columns {missing_cols} were specified but are not present in the data frame. " \
                    f"verifyColumnNames was set to {self.verifyColumnNames}; " \
                    f"available columns: {list(df.columns)}"
                if self.verifyColumnNames:
                    raise RuntimeError(missing_cols_notification)
                log.info(missing_cols_notification)
        return df[columns_to_take]

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["exceptColumns"] = self.exceptColumns
        return info


class FeatureGeneratorFlattenColumns(RuleBasedFeatureGenerator):
    """
    Instances of this class take columns containing vectors and create a data frame with columns containing the entries of
    these vectors.

    For example, if columns "vec1", "vec2" contain vectors of dimensions dim1, dim2, a data frame with dim1+dim2 new columns
    will be created. It will contain the columns "vec1_<i1>", "vec2_<i2>" with i1, i2 ranging in (0, dim1), (0, dim2).
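
    An illustrative usage sketch:

    >>> FeatureGeneratorFlattenColumns("vec").generate(pd.DataFrame({"vec": [[1, 2], [3, 4]]}))
       vec_0  vec_1
    0      1      2
    1      3      4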
    """
    def __init__(self, columns: Optional[Union[str, Sequence[str]]] = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param columns: the name of the column or list of names of columns to be flattened. If None, all columns will be flattened.
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        result_df = pd.DataFrame(index=df.index)
        columns_to_flatten = self.columns if self.columns is not None else df.columns
        for col in columns_to_flatten:
            log.debug(f"Flattening column {col}")
            # NOTE: we found the use of np.stack to produce the most runtime-efficient results;
            # other variants, e.g. based on lists instead of numpy arrays, perform much worse
            values = np.stack(df[col].values)
            if len(values.shape) != 2:
                raise ValueError(f"Column {col} was expected to contain one-dimensional vectors; something went wrong")
            dimension = values.shape[1]
            new_columns = [f"{col}_{i}" for i in range(dimension)]
            log.debug(f"Flattening resulted in {len(new_columns)} new columns")
            result_df[new_columns] = pd.DataFrame(values, index=df.index)
        return result_df

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        return info


class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
    """
    Implements a feature generator via a column generator
    """
    log = log.getChild(__qualname__)

    def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param column_gen: the underlying column generator
        :param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
            in the input data, simply copy it to generate the output (without using the column generator); if False, always
            apply the column generator to generate the output
        :param is_categorical: whether the resulting column is categorical
        :param normalisation_rule_template: template for a DFTNormalisation rule for the resulting column;
            this should only be provided if is_categorical is False
        """
        if is_categorical and normalisation_rule_template is not None:
            raise ValueError("normalisation_rule_template should be None when the generated column is categorical")

        categorical_feature_names = (column_gen.generatedColumnName,) if is_categorical else ()
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rule_template=normalisation_rule_template)

        self.takeInputColumnIfPresent = take_input_column_if_present
        self.columnGen = column_gen

    def info(self):
        info = super().info()
        info["takeInputColumnIfPresent"] = self.takeInputColumnIfPresent
        info["generatedColName"] = self.columnGen.generatedColumnName
        return info

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        col_name = self.columnGen.generatedColumnName
        if self.takeInputColumnIfPresent and col_name in df.columns:
            self.log.debug(f"Taking column '{col_name}' from input data frame")
            series = df[col_name]
        else:
            self.log.debug(f"Generating column '{col_name}' via {self.columnGen}")
            series = self.columnGen.generate_column(df)
        return pd.DataFrame({col_name: series})


class ChainedFeatureGenerator(FeatureGenerator):
    """
    Chains feature generators such that they are executed one after another. The output of the i-th generator is the input of
    the (i+1)-th generator in the sequence.
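
    An illustrative usage sketch:

    >>> cfg = ChainedFeatureGenerator(FeatureGeneratorTakeColumns("vec"), FeatureGeneratorFlattenColumns())
    >>> cfg.fit_generate(pd.DataFrame({"vec": [[1, 2]], "other": [0]}))
       vec_0  vec_1
    0      1      2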
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        """
        :param feature_generators: feature generators to apply in order; the properties of the last feature generator
            determine the relevant meta-data such as categorical feature names and normalisation rules
        """
        self.featureGenerators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            raise ValueError("Empty list of feature generators")
        last_fg: FeatureGenerator = self.featureGenerators[-1]
        super().__init__(
            categorical_feature_names=last_fg.get_categorical_feature_name_regex(), normalisation_rules=last_fg.get_normalisation_rules(),
            add_categorical_default_rules=False)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        for feature_gen in self.featureGenerators:
            df = feature_gen.generate(df, ctx)
        return df

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.fit_generate(x, y, ctx)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["chainedFeatureGeneratorNames"] = self.get_names()
        return info

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")
        for fg in self.featureGenerators:
            x = fg.fit_generate(x, y, ctx)
        return x


class FeatureGeneratorTargetDistribution(FeatureGenerator):
    """
    A feature generator which, for a column T (typically the categorical target column of a classification problem
    or the continuous target column of a regression problem),

    * can ensure that T takes on a limited set of values t_1, ..., t_n by allowing the user to apply
      binning using given bin boundaries,
    * computes for each value c of a categorical column C the conditional empirical distribution
      P(T | C=c) in the training data during the training phase,
    * generates, for each requested column C and value c in the column, n features
      '<C>_<T>_distribution_<t_i>' = P(T=t_i | C=c) if flatten=True
      or one feature '<C>_<T>_distribution' = [P(T=t_1 | C=c), ..., P(T=t_n | C=c)] if flatten=False.

    Being probability values, the features generated by this feature generator are already normalised.
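
    An illustrative usage sketch (column and target names are hypothetical):

    >>> x = pd.DataFrame({"color": ["r", "r", "b"]})
    >>> y = pd.DataFrame({"label": [0, 1, 1]})
    >>> fg = FeatureGeneratorTargetDistribution("color", "label", target_column_bins=None)
    >>> features = fg.fit_generate(x, y)  # e.g. P(label=0 | color="r") = 0.5
    >>> sorted(features.columns)
    ['color_label_distribution_0', 'color_label_distribution_1']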
    """
    def __init__(self,
            columns: Union[str, Sequence[str]],
            target_column: str,
            target_column_bins: Optional[Union[Sequence[float], int, pd.IntervalIndex]],
            target_column_in_features_df=False,
            flatten=True):
        """
        :param columns: the categorical columns for which to generate distribution features
        :param target_column: the column the distributions over which will make up the features.
            If target_column_bins is not None, this column will be discretised before computing the conditional distributions.
        :param target_column_bins: if not None, specifies the binning to apply via pandas.cut
            (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html).
            Note that if a value matches no bin, NaN will be generated. To avoid this when specifying bin boundaries in a list,
            -inf and +inf should be used as the first and last entries.
        :param target_column_in_features_df: if True, the target column will be looked up in the features data frame (X) during
            fitting instead of in the target data frame (Y)
        :param flatten: whether to generate a separate scalar feature per distribution value rather than one feature
            with all of the distribution's values
        """
        self.flatten = flatten
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.targetColumn = target_column
        self.targetColumnInFeaturesDf = target_column_in_features_df
        self.targetColumnBins = target_column_bins
        if self.flatten:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(skip=True)
        else:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(unsupported=True)
        super().__init__(normalisation_rule_template=normalisation_rule_template)
        self._targetColumnValues = None
        # this will hold the mapping: column -> featureValue -> targetValue -> targetValueEmpiricalProbability
        self._discreteTargetDistributionsByColumn: Optional[Dict[str, Dict[Any, Dict[Any, float]]]] = None

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["targetColumn"] = self.targetColumn
        info["targetColumnBins"] = self.targetColumnBins
        info["flatten"] = self.flatten
        return info

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Persists the empirical target probability distributions for all unique values in the specified columns
        """
        if self.targetColumnInFeaturesDf:
            target = x[self.targetColumn]
        else:
            target = y[self.targetColumn]
        if self.targetColumnBins is not None:
            discretised_target = pd.cut(target, self.targetColumnBins)
        else:
            discretised_target = target
        self._targetColumnValues = discretised_target.unique()

        self._discreteTargetDistributionsByColumn = {}
        for column in self.columns:
            self._discreteTargetDistributionsByColumn[column] = {}
            column_target_df = pd.DataFrame()
            column_target_df[column] = x[column]
            column_target_df["target"] = discretised_target.values
            for value, value_targets_df in column_target_df.groupby(column):
                # the normalised value_counts contain the mapping targetValue -> targetValueEmpiricalProbability for the current value
                self._discreteTargetDistributionsByColumn[column][value] = \
                    value_targets_df["target"].value_counts(normalize=True).to_dict()

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._discreteTargetDistributionsByColumn is None:
            raise Exception("Feature generator has not been fitted")
        result_df = pd.DataFrame(index=df.index)
        for column in self.columns:
            target_distribution_by_value = self._discreteTargetDistributionsByColumn[column]
            if self.flatten:
                for target_value in self._targetColumnValues:
                    # Important: pd.Series.apply should not be used here, as it would label the resulting column as categorical
                    result_df[f"{column}_{self.targetColumn}_distribution_{target_value}"] = \
                        [target_distribution_by_value[value].get(target_value, 0.0) for value in df[column]]
            else:
                distributions = [[target_distribution_by_value[value].get(target_value, 0.0)
                    for target_value in self._targetColumnValues] for value in df[column]]
                result_df[f"{column}_{self.targetColumn}_distribution"] = pd.Series(distributions, index=df[column].index)
        return result_df


class FeatureGeneratorFromVectorModel(FeatureGenerator):
    def __init__(self,
            vector_model: "VectorModel",
            target_feature_generator: FeatureGenerator,
            categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            input_feature_generator: FeatureGenerator = None,
            use_target_feature_generator_for_training=False):
        """
        Provides features via the predictions of a given model

        :param vector_model: the model whose predictions are used to generate features
        :param target_feature_generator: the generator for the target to be predicted
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        :param input_feature_generator: optional feature generator to be applied to the input of the vector model's fit and predict
        :param use_target_feature_generator_for_training: if False, this generator will always apply the model
            to generate features.
            If True, this generator will use target_feature_generator to generate features, bypassing the model.
            This is useful for the case where the model which is to receive the generated features shall be
            trained on the original targets rather than the predictions thereof.
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)

        self.useTargetFeatureGeneratorForTraining = use_target_feature_generator_for_training
        self.targetFeatureGenerator = target_feature_generator
        self.inputFeatureGenerator = input_feature_generator
        self.vectorModel = vector_model

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        target_df = self.targetFeatureGenerator.fit_generate(x, y)
        if self.inputFeatureGenerator:
            x = self.inputFeatureGenerator.fit_generate(x, y)
        self.vectorModel.fit(x, target_df)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self.inputFeatureGenerator:
            df = self.inputFeatureGenerator.generate(df)
        if self.useTargetFeatureGeneratorForTraining and not ctx.is_fitted():
            log.debug(f"Using targetFeatureGenerator {self.targetFeatureGenerator.__class__.__name__} to generate target features")
            return self.targetFeatureGenerator.generate(df)
        else:
            log.debug(f"Generating target features via {self.vectorModel.__class__.__name__}")
            return self.vectorModel.predict(df)

    def info(self):
        info = super().info()
        info["wrappedModel"] = str(self.vectorModel)
        return info


class FeatureGeneratorMapColumn(RuleBasedFeatureGenerator, ABC):
    """
    Creates a single feature from a single input column by applying a function to each element of the input column.
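
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class SquareColumn(FeatureGeneratorMapColumn):
    ...     def __init__(self):
    ...         super().__init__(input_col_name="x", feature_col_name="x_squared")
    ...     def _create_value(self, value):
    ...         return value ** 2
    >>> SquareColumn().generate(pd.DataFrame({"x": [2, 3]}))
       x_squared
    0          4
    1          9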
    """
    def __init__(self,
            input_col_name: str,
            feature_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name
        self._featureColName = feature_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = input_series.apply(self._create_value)
        return pd.DataFrame({self._featureColName: values}, index=df.index)

    @abstractmethod
    def _create_value(self, value):
        """
        Maps a value from the input column to a feature value

        :param value: a value from the input column
        :return: the feature value
        """
        pass


class FeatureGeneratorMapColumnDict(RuleBasedFeatureGenerator, ABC):
    """
    Creates an arbitrary number of features from a single input column by applying a function to each element of the input column.
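
    An illustrative subclass sketch (the class and column names are hypothetical):

    >>> class PointFeatures(FeatureGeneratorMapColumnDict):
    ...     def __init__(self):
    ...         super().__init__(input_col_name="point")
    ...     def _create_features_dict(self, value):
    ...         return {"x": value[0], "y": value[1]}
    >>> PointFeatures().generate(pd.DataFrame({"point": [(1, 2)]}))
       x  y
    0  1  2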
    """
    def __init__(self, input_col_name: str, categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None, add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = [self._create_features_dict(v) for v in input_series]
        return pd.DataFrame(values, index=df.index)

    @abstractmethod
    def _create_features_dict(self, value) -> Dict[str, Any]:
        """
        Maps a value from the input column to a dictionary containing one or more features.

        :param value: a value from the input column
        :return: a dictionary mapping feature names to values
        """
        pass


class FeatureGeneratorNAMarker(RuleBasedFeatureGenerator):
    """
    Creates features indicating whether another feature is N/A (not available).
    It can be practical to use this feature generator in conjunction with DFTFillNA for models that cannot handle missing values.
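
    An illustrative usage sketch:

    >>> FeatureGeneratorNAMarker(["a"]).generate(pd.DataFrame({"a": [1.0, None]}))
       a_na
    0     0
    1     1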
    """
    def __init__(self, columns: List[str], value_a=0, value_na=1):
        """
        Note: When changing the default values, use only values that are considered to be normalised when using this
        feature generator in a context where DFTNormalisation is applied (no normalisation is applied to features generated
        by this feature generator).

        :param columns: the columns for which to generate N/A marker features
        :param value_a: the feature value if the input feature is available
        :param value_na: the feature value if the input feature is not available
        """
        super().__init__(normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True))
        self.columns = columns
        self.valueA = value_a
        self.valueNA = value_na

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        new_cols = {}
        value_map = {True: self.valueNA, False: self.valueA}
        for col in self.columns:
            new_cols[f"{col}_na"] = [value_map[is_na] for is_na in df[col].isna()]
        return pd.DataFrame(new_cols, index=df.index)


def flattened_feature_generator(fgen: FeatureGenerator, columns_to_flatten: List[str] = None, keep_other_columns=True,
        normalisation_rules: Sequence[DFTNormalisation.Rule] = (),
        normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
    """
    Creates a flattening version of the given feature generator.

    :param fgen: the feature generator which generates columns that are to be flattened
    :param columns_to_flatten: list of names of output columns to be flattened; if None, flatten all columns
    :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained
        by the returned feature generator
    :param normalisation_rules: additional normalisation rules for the flattened output columns
    :param normalisation_rule_template: this parameter can be supplied instead of normalisation_rules for the case where
        there shall be a single rule that applies to all flattened output columns
    :return: a feature generator instance that will generate flattened versions of the specified columns and leave
        all other output columns as is

    Example:

    >>> from sensai.featuregen import FeatureGeneratorTakeColumns, flattened_feature_generator
    >>> import pandas as pd
    >>>
    >>> df = pd.DataFrame({"foo": [[1, 2], [3, 4]], "bar": ["a", "b"]})
    >>> fgen = flattened_feature_generator(FeatureGeneratorTakeColumns(), columns_to_flatten=["foo"])
    >>> fgen.generate(df)
       foo_0  foo_1  bar
    0      1      2    a
    1      3      4    b
    """
    flattening_generator = FeatureGeneratorFlattenColumns(columns=columns_to_flatten, normalisation_rules=normalisation_rules,
        normalisation_rule_template=normalisation_rule_template)
    if columns_to_flatten is None or not keep_other_columns:
        return ChainedFeatureGenerator(fgen, flattening_generator)
    else:
        return ChainedFeatureGenerator(fgen,
            MultiFeatureGenerator(flattening_generator, FeatureGeneratorTakeColumns(except_columns=columns_to_flatten)))


class FeatureGeneratorFromDFT(FeatureGenerator):
    def __init__(self, dft: DataFrameTransformer, categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template, add_categorical_default_rules=add_categorical_default_rules)
        self.dft = dft

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.dft.fit(x)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        return self.dft.apply(df)