Coverage for src/sensai/featuregen/feature_generator.py: 61%
416 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1import functools
2import logging
3import re
4from abc import ABC, abstractmethod
5from typing import Sequence, List, Union, Callable, Any, Dict, TYPE_CHECKING, Optional
7import numpy as np
8import pandas as pd
10from .. import util, data_transformation
11from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
12from ..util import flatten_arguments
13from ..util.string import or_regex_group, ToStringMixin, list_string
14from ..util.typing import PandasNamedTuple
16if TYPE_CHECKING:
17 from ..vector_model import VectorModel
18 from ..columngen import ColumnGenerator
21log = logging.getLogger(__name__)
class DuplicateColumnNamesException(Exception):
    """
    Raised when a generated feature data frame contains the same column name more than once
    """
class FeatureGenerator(ToStringMixin, ABC):
    """
    Base class for feature generators that create a new DataFrame containing feature values
    from an input DataFrame
    """
    def __init__(self,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules: bool = True):
        """
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, the presence of meta-information can later be leveraged for further transformations, e.g., one-hot encoding.
        :param normalisation_rules: Rules to be used by DFTNormalisation (e.g., for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: This parameter can be supplied instead of `normalisation_rules` for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labeled as
            categorical. Like normalisation_rules, this is only relevant if a DFTNormalisation object consuming
            normalisation rules is instantiated and used within a data processing pipeline.
            It does not affect feature generation.
        :param add_categorical_default_rules:
            If True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding one-hot
            encoded features (with "_<index>" appended) will be added. It does not affect feature generation.
        :raises ValueError: if both non-empty `normalisation_rules` and a `normalisation_rule_template` are given
        """
        # NOTE: While it would be more elegant to not have all of the above constructor arguments and instead provide
        # them later using "with*" methods, this would have the significant drawback that it would enable
        # all such attributes to be provided in all subclasses, even in ones where we know settings exactly
        # and can provide them directly in the subclass constructor implementation. Thus it would enable
        # non-sensical settings which should be avoided.
        if len(normalisation_rules) > 0 and normalisation_rule_template is not None:
            raise ValueError("Normalisation rules should be empty when a rule template is provided")

        self._generatedColumnNames = None
        self.__categoricalFeatureNames = categorical_feature_names

        # isinstance (rather than an exact type comparison) so that str subclasses are treated as a regex, too,
        # instead of being iterated character by character as if they were a sequence of names
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        elif categorical_feature_names is not None and len(categorical_feature_names) > 0:
            categorical_feature_name_regex = or_regex_group(categorical_feature_names)
        else:
            categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex
        self._categoricalFeatureRules = []

        if normalisation_rule_template is not None:
            # Note: placeholder rule's regex will be set in generate
            self._normalisationRules = [normalisation_rule_template.to_placeholder_rule()]
            self._mustUpdateNormalisationRuleBasedOnColumnNames = True
        else:
            self._normalisationRules = list(normalisation_rules)
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        if add_categorical_default_rules and categorical_feature_name_regex is not None:
            rule_cls = data_transformation.DFTNormalisation.Rule
            self._categoricalFeatureRules.append(rule_cls(categorical_feature_name_regex, unsupported=True))
            # rule for one-hot transformation
            self._categoricalFeatureRules.append(rule_cls(categorical_feature_name_regex + r"_\d+", skip=True))

        self._name: Optional[str] = None
        self._isFitted = False

    # for backwards compatibility with persisted Featuregens based on code prior to commit 7088cbbe
    # They lack the __isFitted attribute and we assume that each such Featuregen was fitted
    def __setstate__(self, d):
        d.setdefault("_isFitted", True)
        self.__dict__ = d

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        return dict(name=self.get_name())

    def get_name(self) -> str:
        """
        :return: the name of this feature generator, which may be a default name if the name has not been set. Note that feature generators
            created by a FeatureGeneratorFactory always get the name with which the generator factory was registered.
        """
        if self._name is None:
            return f"{self.__class__.__name__}-{id(self)}"
        return self._name

    def set_name(self, name: str) -> None:
        """
        :param name: the name to assign to this feature generator
        """
        self._name = name

    def get_names(self) -> List[str]:
        """
        :return: the list of names of feature generators; will be a list with a single name for a regular feature generator
        """
        return [self.get_name()]

    def info(self):
        """
        :return: a dictionary with descriptive information on this feature generator (name, categorical feature names,
            generated column names, fitting status and normalisation rules)
        """
        return {
            "name": self.get_name(),
            "categoricalFeatureNames": self.__categoricalFeatureNames,
            "generatedColumnNames": self.get_generated_column_names(),
            "isFitted": self.is_fitted(),
            "normalisationRules": self.get_normalisation_rules(),
        }

    def get_normalisation_rules(self, include_generated_categorical_rules=True) -> List[data_transformation.DFTNormalisation.Rule]:
        """
        :param include_generated_categorical_rules: whether to include the default rules that were generated for
            categorical features (if any)
        :return: the list of normalisation rules
        """
        if include_generated_categorical_rules:
            return self._normalisationRules + self._categoricalFeatureRules
        else:
            return self._normalisationRules

    def get_categorical_feature_name_regex(self) -> Optional[str]:
        """
        :return: the regex matching the names of all categorical features or None if there are no categorical features
        """
        return self._categoricalFeatureNameRegex

    def is_categorical_feature(self, feature_name):
        """
        :param feature_name: the name of the feature/column to check
        :return: True if the name is fully matched by the categorical feature name regex, False otherwise
        """
        if self._categoricalFeatureNameRegex is None:
            return False
        return re.fullmatch(self._categoricalFeatureNameRegex, feature_name) is not None

    def get_generated_column_names(self) -> Optional[List[str]]:
        """
        :return: Column names of the data frame generated by the most recent call of the feature generators 'generate' method.
            Returns None if generate was never called.
        """
        return self._generatedColumnNames

    def to_dft(self):
        """
        :return: a data frame transformer (DFTFromFeatureGenerator) wrapping this feature generator
        """
        return DFTFromFeatureGenerator(self)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        pass

    def fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        log.debug(f"Fitting {self}")
        self._fit(x, y=y, ctx=ctx)
        self._isFitted = True

    def is_fitted(self):
        """
        :return: True if this feature generator has been fitted, False otherwise
        """
        return self._isFitted

    def generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``
        :raises DuplicateColumnNamesException: if the generated data frame contains duplicate column names
        :raises Exception: if the feature generator has not been fitted
        """
        if not self.is_fitted():
            raise Exception(f"Cannot generate features from a FeatureGenerator which is not fitted: "
                f"the feature generator {self.get_name()} requires fitting")

        log.debug(f"Generating features with {self}")
        result_df = self._generate(df, ctx=ctx)

        is_column_duplicated_array = result_df.columns.duplicated()
        if any(is_column_duplicated_array):
            duplicated_columns = set(result_df.columns[is_column_duplicated_array])
            raise DuplicateColumnNamesException(f"Feature data frame contains duplicate column names: {duplicated_columns}")

        # ensure that categorical columns have dtype 'category'
        categorical_feature_names = []
        if self._categoricalFeatureNameRegex is not None:
            result_df = result_df.copy()  # result_df we got might be a view of some other DF, so before we modify it, we must copy it
            categorical_feature_names = [col for col in result_df.columns if self.is_categorical_feature(col)]
            for col_name in categorical_feature_names:
                series = result_df[col_name]
                if series.dtype.name != 'category':
                    # astype creates a new series, so no additional copy is required
                    result_df[col_name] = series.astype('category')

        self._generatedColumnNames = result_df.columns

        # finalise normalisation rule template (if any) by making it apply to all non-categorical features
        # (a default rule applies to categorical features)
        if self._mustUpdateNormalisationRuleBasedOnColumnNames:
            # retain the column order (columns are known to be unique at this point), such that the
            # resulting regex is deterministic
            categorical_name_set = set(categorical_feature_names)
            non_categorical_features = [col for col in self._generatedColumnNames if col not in categorical_name_set]
            # NOTE: We here update the existing rule which was instantiated with a dummy regex because
            # some mechanisms (e.g. MultiFeatureGenerators) retrieve rule instances early on (before generate
            # is ever called) and therefore updating an existing rule is the safe route and should always
            # work, because rules should never actually be applied before generate has indeed been called
            self._normalisationRules[0].set_regex(or_regex_group(non_categorical_features))
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        return result_df

    @abstractmethod
    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame.

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``.
            The data frame's columns holding categorical columns are not required to have dtype ``category``;
            this will be ensured by the encapsulating call as long as the respective columns' names
            were appropriately provided at construction.
        """
        pass

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits the feature generator and subsequently generates features for the data points in the given data frame

        :param x: the input data frame for the learning problem and for which to generate features
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``x``
        """
        self.fit(x, y, ctx)
        return self.generate(x, ctx)

    def flattened(self,
            columns_to_flatten: List[str] = None,
            normalisation_rules=(),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            keep_other_columns=True) -> "ChainedFeatureGenerator":
        """
        Returns a new feature generator which returns flattened versions of one or more of the vector-valued columns generated
        by this feature generator.

        :param columns_to_flatten: the list of columns to flatten; if None, flatten all columns
        :param normalisation_rules: a list of normalisation rules which apply to the flattened columns
        :param normalisation_rule_template: a normalisation rule template which applies to all generated flattened columns
        :param keep_other_columns: if True, any additional columns that are not to be flattened are to be retained
            by the returned feature generator; if False, additional columns are to be discarded
        :return: a feature generator which generates the flattened columns
        """
        return flattened_feature_generator(self, columns_to_flatten=columns_to_flatten, normalisation_rules=normalisation_rules,
            keep_other_columns=keep_other_columns, normalisation_rule_template=normalisation_rule_template)

    def concat(self, *others: "FeatureGenerator") -> "MultiFeatureGenerator":
        """
        Concatenates this feature generator with one or more other feature generators in order to produce a feature generator that
        jointly generates all features

        :param others: other feature generators
        :return: a :class:`MultiFeatureGenerator`
        """
        if isinstance(self, MultiFeatureGenerator):
            fgens = list(self.featureGenerators)
        else:
            fgens = [self]
        fgens.extend(others)
        return MultiFeatureGenerator(fgens)

    def chain(self, *others: "FeatureGenerator") -> "ChainedFeatureGenerator":
        """
        Chains this feature generator with one or more other feature generators such that each feature generator
        receives as input the output of the preceding feature generator. The resulting feature generator
        produces the features of the last element in the chain.

        :param others: other feature generators
        :return: a :class:`ChainedFeatureGenerator`
        """
        if isinstance(self, ChainedFeatureGenerator):
            # copy the list, such that extending it below does not modify this (existing) chain in place
            fgens = list(self.featureGenerators)
        else:
            fgens = [self]
        fgens.extend(others)
        return ChainedFeatureGenerator(fgens)
class RuleBasedFeatureGenerator(FeatureGenerator, ABC):
    """
    A feature generator which does not require fitting
    """
    def is_fitted(self):
        """
        :return: always True, as no fitting is required
        """
        return True

    def fit(self, x, y=None, ctx=None):
        """
        Does nothing, as no fitting is required
        """
        return None

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Does nothing, as no fitting is required
        """
        return None
class MultiFeatureGenerator(FeatureGenerator):
    """
    Wrapper for multiple feature generators. Calling generate here applies all given feature generators independently and
    returns the concatenation of their outputs
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        """
        :param feature_generators: the feature generators to combine (given as individual arguments and/or lists)
        """
        self.featureGenerators = feature_generators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            log.debug("Creating an empty MultiFeatureGenerator. It will generate a data frame without columns.")
        # the combined regex for categorical features is the disjunction of the contained generators' regexes
        categorical_feature_name_regexes = [regex for regex in (fg.get_categorical_feature_name_regex() for fg in feature_generators)
            if regex is not None]
        if len(categorical_feature_name_regexes) > 0:
            categorical_feature_names = "|".join(categorical_feature_name_regexes)
        else:
            categorical_feature_names = ()
        normalisation_rules = util.concat_sequences([fg.get_normalisation_rules() for fg in feature_generators])
        # the contained generators' rules already include any default rules for categorical features
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            add_categorical_default_rules=False)

    def _tostring_object_info(self) -> str:
        return f"featureGenerators={list_string(self.featureGenerators)}"

    def _generate_from_multiple(self, generate_features: Callable[[FeatureGenerator], pd.DataFrame], index) -> pd.DataFrame:
        """
        Applies the given function to each contained feature generator and concatenates the resulting data frames column-wise

        :param generate_features: the function which generates the features data frame for a given feature generator
        :param index: the index to use for the (empty) result if there are no feature generators
        :return: the combined data frame
        :raises Exception: if a column name occurs more than once in the combined data frame
        """
        dfs = [generate_features(fg) for fg in self.featureGenerators]
        if len(dfs) == 0:
            return pd.DataFrame(index=index)
        combined_df = pd.concat(dfs, axis=1)
        if len(combined_df.columns) != len(set(combined_df.columns)):
            raise Exception(f"At least one column was generated more than once: {list(combined_df.columns)}; "
                f"check feature generators for correctness!")
        return combined_df

    def _generate(self, input_df: pd.DataFrame, ctx=None):
        def generate_features(fg: FeatureGenerator):
            return fg.generate(input_df, ctx=ctx)

        return self._generate_from_multiple(generate_features, input_df.index)

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits each contained feature generator and generates its features, returning the concatenation of all generated features

        :param x: the input data frame for the learning problem and for which to generate features
        :param y: the corresponding output data frame for the learning problem
        :param ctx: a context object whose functionality may be required for feature generation
        :return: the combined data frame of generated features
        """
        log.debug(f"Fitting and generating features with {self}")

        def generate_features(fg: FeatureGenerator):
            return fg.fit_generate(x, y, ctx)

        return self._generate_from_multiple(generate_features, x.index)

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        for fg in self.featureGenerators:
            # pass on the context, consistent with fit_generate and _generate
            fg.fit(x, y, ctx)

    def is_fitted(self):
        """
        :return: True if all contained feature generators are fitted
        """
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super(MultiFeatureGenerator, self).info()
        info["featureGeneratorNames"] = self.get_names()
        return info

    def get_names(self) -> list:
        """
        :return: the flat list of names of all contained feature generators
        """
        return [name for fg in self.featureGenerators for name in fg.get_names()]
class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
    """
    Generates feature values for one data point at a time, creating a dictionary with
    feature values from each named tuple
    """
    def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param cache: an optional cache in which generated feature dictionaries are stored, using the
            data point's index value as the key
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.cache = cache

    def _generate(self, df: pd.DataFrame, ctx=None):
        feature_dicts = []
        for position, row in enumerate(df.itertuples()):
            # row is a PandasNamedTuple, whose Index attribute holds the data point's index value
            if position % 100 == 0:
                log.debug(f"Generating feature via {self.__class__.__name__} for index {position}")
            # try the cache first (if any) and compute the features only if no value was found
            feature_dict = self.cache.get(row.Index) if self.cache is not None else None
            if feature_dict is None:
                feature_dict = self._generate_feature_dict(row)
                if self.cache is not None:
                    self.cache.set(row.Index, feature_dict)
            feature_dicts.append(feature_dict)
        return pd.DataFrame(feature_dicts, index=df.index)

    @abstractmethod
    def _generate_feature_dict(self, named_tuple) -> Dict[str, Any]:
        """
        Creates a dictionary with feature values from a named tuple

        :param named_tuple: the data point for which to generate features
        :return: a dictionary mapping feature names to values
        """
        pass
class FeatureGeneratorTakeColumns(RuleBasedFeatureGenerator):
    """
    A feature generator which takes (a subset of) the columns of the input data frame without modification
    """
    def __init__(self, columns: Union[str, List[str]] = None, except_columns: Sequence[str] = (),
            categorical_feature_names: Optional[Union[Sequence[str], str]] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            verify_column_names=True):
        """
        :param columns: name of the column or list of names of columns to be taken. If None, all columns will be taken.
        :param except_columns: list of names of columns to not take if present in the input df
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names
            (which must match only the features generated by this feature generator, i.e. it should not match feature names generated
            by other feature generators).
            It will be ensured that the respective columns in the generated data frames will have dtype 'category'.
            Furthermore, presence of meta-information can later be leveraged for further transformations, e.g. one-hot encoding.
        :param normalisation_rules: Rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model).
            These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where
            there shall be a single rule that applies to all columns generated by this feature generator that were not labeled as
            categorical.
        :param verify_column_names: if True and columns to take were specified, will raise an error in case said columns
            are missing during feature generation. If False, will log on info level instead and take only the
            columns that are present.
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.exceptColumns = except_columns
        self.verifyColumnNames = verify_column_names

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the input data frame
        :param ctx: the context (unused)
        :return: the data frame containing the columns to take
        :raises RuntimeError: if explicitly specified columns are missing and column name verification is enabled
        """
        columns_to_take = self.columns if self.columns is not None else df.columns
        columns_to_take = [col for col in columns_to_take if col not in self.exceptColumns]

        if self.columns is not None:
            missing_cols = set(columns_to_take).difference(df.columns)
            if len(missing_cols) > 0:
                missing_cols_notification = f"Columns {missing_cols} were specified but are not present in data frame. " \
                    f"verifyColumnNames was set to {self.verifyColumnNames}; " \
                    f"available columns: {list(df.columns)}"
                if self.verifyColumnNames:
                    raise RuntimeError(missing_cols_notification)
                log.info(missing_cols_notification)
                # selecting labels that are not present would raise a KeyError, so take only the columns that exist
                columns_to_take = [col for col in columns_to_take if col not in missing_cols]
        return df[columns_to_take]

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["exceptColumns"] = self.exceptColumns
        return info
class FeatureGeneratorFlattenColumns(RuleBasedFeatureGenerator):
    """
    Instances of this class take columns with vectors and create a data frame with columns containing entries of
    these vectors.

    For example, if columns "vec1", "vec2" contain vectors of dimensions dim1, dim2, a data frame with dim1+dim2 new columns
    will be created. It will contain the columns "vec1_<i1>", "vec2_<i2>" with i1, i2 ranging in (0, dim1), (0, dim2).
    """
    def __init__(self, columns: Optional[Union[str, Sequence[str]]] = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param columns: name of the column or list of names of columns to be flattened. If None, all columns will be flattened.
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.columns = [columns] if isinstance(columns, str) else columns

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        flattened_df = pd.DataFrame(index=df.index)
        source_columns = df.columns if self.columns is None else self.columns
        for col in source_columns:
            log.debug(f"Flattening column {col}")
            # NOTE: we found the use of np.stack to produce the most runtime-efficient results.
            # Other variants, e.g. based on lists instead of numpy.arrays, perform much worse.
            stacked = np.stack(df[col].values)
            if len(stacked.shape) != 2:
                raise ValueError(f"Column {col} was expected to contain one dimensional vectors, something went wrong")
            # one new column per vector component
            new_columns = [f"{col}_{i}" for i in range(stacked.shape[1])]
            log.debug(f"Flattening resulted in {len(new_columns)} new columns")
            flattened_df[new_columns] = pd.DataFrame(stacked, index=df.index)
        return flattened_df

    def info(self):
        result = super().info()
        result["columns"] = self.columns
        return result
class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
    """
    Implements a feature generator via a column generator
    """
    log = log.getChild(__qualname__)

    def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param column_gen: the underlying column generator
        :param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
            in the input data, simply copy it to generate the output (without using the column generator); if False, always
            apply the column generator to generate the output
        :param is_categorical: whether the resulting column is categorical
        :param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
            This should only be provided if is_categorical is False
        :raises ValueError: if a normalisation rule template is given for a categorical column
        """
        if is_categorical:
            if normalisation_rule_template is not None:
                raise ValueError(f"normalisationRuleTemplate should be None when the generated column is categorical")
            categorical_feature_names = (column_gen.generatedColumnName,)
        else:
            categorical_feature_names = ()
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rule_template=normalisation_rule_template)

        self.takeInputColumnIfPresent = take_input_column_if_present
        self.columnGen = column_gen

    def info(self):
        result = super().info()
        result["takeInputColumnIfPresent"] = self.takeInputColumnIfPresent
        result["generatedColName"] = self.columnGen.generatedColumnName
        return result

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        col_name = self.columnGen.generatedColumnName
        use_input_column = self.takeInputColumnIfPresent and col_name in df.columns
        if not use_input_column:
            self.log.debug(f"Generating column '{col_name}' via {self.columnGen}")
            series = self.columnGen.generate_column(df)
        else:
            self.log.debug(f"Taking column '{col_name}' from input data frame")
            series = df[col_name]
        return pd.DataFrame({col_name: series})
class ChainedFeatureGenerator(FeatureGenerator):
    """
    Chains feature generators such that they are executed one after another. The output of generator i (i>=1) is the
    input of generator i+1 in the generator sequence.
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        """
        :param feature_generators: feature generators to apply in order; the properties of the last feature generator
            determine the relevant meta-data such as categorical feature names and normalisation rules
        :raises ValueError: if no feature generators are given
        """
        self.featureGenerators = flatten_arguments(feature_generators)
        # check the flattened list, such that passing an empty list is detected as well
        if len(self.featureGenerators) == 0:
            raise ValueError("Empty list of feature generators")
        last_fg: FeatureGenerator = self.featureGenerators[-1]
        # the last generator's rules already include any default rules for categorical features
        super().__init__(
            categorical_feature_names=last_fg.get_categorical_feature_name_regex(), normalisation_rules=last_fg.get_normalisation_rules(),
            add_categorical_default_rules=False)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        for feature_gen in self.featureGenerators:
            df = feature_gen.generate(df, ctx)
        return df

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        # fitting a generator in the chain requires the output of the preceding generator,
        # so fitting is achieved by fitting and generating throughout the chain
        self.fit_generate(x, y, ctx)

    def is_fitted(self):
        """
        :return: True if all feature generators in the chain are fitted
        """
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["chainedFeatureGeneratorNames"] = self.get_names()
        return info

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits each feature generator in the chain on the output of its predecessor and returns the features generated
        by the last feature generator

        :param x: the input data frame for the first feature generator in the chain
        :param y: the corresponding output data frame for the learning problem
        :param ctx: a context object whose functionality may be required for feature generation
        :return: the data frame generated by the last feature generator in the chain
        """
        log.debug(f"Fitting and generating features with {self}")
        for fg in self.featureGenerators:
            x = fg.fit_generate(x, y, ctx)
        return x
615class FeatureGeneratorTargetDistribution(FeatureGenerator):
616 """
617 A feature generator, which, for a column T (typically the categorical target column of a classification problem
618 or the continuous target column of a regression problem),
620 * can ensure that T takes on limited set of values t_1, ..., t_n by allowing the user to apply
621 binning using given bin boundaries
622 * computes for each value c of a categorical column C the conditional empirical distribution
623 P(T | C=c) in the training data during the training phase,
624 * generates, for each requested column C and value c in the column, n features
625 '<C>_<T>_distribution_<t_i>' = P(T=t_i | C=c) if flatten=True
626 or one feature '<C>_<T>_distribution' = [P(T=t_i | C=c), ..., P(T=t_n | C=c)] if flatten=False
628 Being probability values, the features generated by this feature generator are already normalised.
629 """
def __init__(self,
        columns: Union[str, Sequence[str]],
        target_column: str,
        target_column_bins: Optional[Union[Sequence[float], int, pd.IntervalIndex]],
        target_column_in_features_df=False,
        flatten=True):
    """
    :param columns: the categorical column (a single name) or columns (a sequence of names) for which to
        generate distribution features
    :param target_column: the column whose conditional empirical distributions make up the features.
        If target_column_bins is not None, this column will be discretised before the distributions are computed
    :param target_column_bins: if not None, specifies the binning to apply via pandas.cut
        (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html).
        Note that if a value matches no bin, NaN will be generated. To avoid this when specifying bin boundaries
        in a list, -inf and +inf should be used as the first and last entries.
    :param target_column_in_features_df: if True, fitting will look for target_column in the features data frame (X)
        instead of in the target data frame (Y)
    :param flatten: whether to generate a separate scalar feature per distribution value rather than one feature
        holding all of the distribution's values
    """
    self.flatten = flatten
    # a single column name is normalised to a one-element list
    self.columns = [columns] if isinstance(columns, str) else columns
    self.targetColumn = target_column
    self.targetColumnInFeaturesDf = target_column_in_features_df
    self.targetColumnBins = target_column_bins

    # flattened features use a rule template with skip=True, the non-flattened feature one with unsupported=True
    rule_template_args = {"skip": True} if self.flatten else {"unsupported": True}
    super().__init__(normalisation_rule_template=data_transformation.DFTNormalisation.RuleTemplate(**rule_template_args))

    # state established by fitting
    self._targetColumnValues = None
    # This will hold the mapping: column -> featureValue -> targetValue -> targetValueEmpiricalProbability
    self._discreteTargetDistributionsByColumn: Optional[Dict[str, Dict[Any, Dict[Any, float]]]] = None
def info(self):
    """
    Extends the info obtained from the super-class with this feature generator's configuration.

    :return: the object returned by the super-class's info(), with the entries "columns", "targetColumn",
        "targetColumnBins" and "flatten" set from the respective attributes
    """
    result = super().info()
    additional_entries = (
        ("columns", self.columns),
        ("targetColumn", self.targetColumn),
        ("targetColumnBins", self.targetColumnBins),
        ("flatten", self.flatten),
    )
    for key, value in additional_entries:
        result[key] = value
    return result
673 def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
674 """
675 This will persist the empirical target probability distributions for all unique values in the specified columns
676 """
677 if self.targetColumnInFeaturesDf:
678 target = x[self.targetColumn]
679 else:
680 target = y[self.targetColumn]
681 if self.targetColumnBins is not None:
682 discretised_target = pd.cut(target, self.targetColumnBins)
683 else:
684 discretised_target = target
685 self._targetColumnValues = discretised_target.unique()
687 self._discreteTargetDistributionsByColumn = {}
688 for column in self.columns:
689 self._discreteTargetDistributionsByColumn[column] = {}
690 column_target_df = pd.DataFrame()
691 column_target_df[column] = x[column]
692 column_target_df["target"] = discretised_target.values
693 for value, valueTargetsDf in column_target_df.groupby(column):
694 # The normalized value_counts contain targetValue -> targetValueEmpiricalProbability for the current value
695 self._discreteTargetDistributionsByColumn[column][value] = valueTargetsDf["target"].value_counts(normalize=True).to_dict()
697 def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
698 if self._discreteTargetDistributionsByColumn is None:
699 raise Exception("Feature generator has not been fitted")
700 result_df = pd.DataFrame(index=df.index)
701 for column in self.columns:
702 target_distribution_by_value = self._discreteTargetDistributionsByColumn[column]
703 if self.flatten:
704 for target_value in self._targetColumnValues:
705 # Important: pd.Series.apply should not be used here, as it would label the resulting column as categorical
706 result_df[f"{column}_{self.targetColumn}_distribution_{target_value}"] = \
707 [target_distribution_by_value[value].get(target_value, 0.0) for value in df[column]]
708 else:
709 distributions = [[target_distribution_by_value[value].get(targetValue, 0.0) for targetValue in self._targetColumnValues]
710 for value in df[column]]
711 result_df[f"{column}_{self.targetColumn}_distribution"] = pd.Series(distributions, index=df[column].index)
712 return result_df
class FeatureGeneratorFromVectorModel(FeatureGenerator):
    """
    Provides features via the predictions of a given model
    """
    def __init__(self,
            vector_model: "VectorModel",
            target_feature_generator: FeatureGenerator,
            categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            input_feature_generator: Optional[FeatureGenerator] = None,
            use_target_feature_generator_for_training=False):
        """
        :param vector_model: the model whose predictions are used as features
        :param target_feature_generator: generator for the target that the model is to predict
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        :param input_feature_generator: optional feature generator to be applied to the input of vector_model's
            fit and predict
        :param use_target_feature_generator_for_training: if False, this generator will always apply the model
            to generate features.
            If True, this generator will use target_feature_generator to generate features (bypassing the model)
            whenever the context passed at generation time reports that it is not fitted.
            This is useful for the case where the model which is to receive the generated features shall be
            trained on the original targets rather than the predictions thereof.
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)

        self.targetFeatureGenerator = target_feature_generator
        self.inputFeatureGenerator = input_feature_generator
        self.useTargetFeatureGeneratorForTraining = use_target_feature_generator_for_training
        self.vectorModel = vector_model

    def _fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None, ctx=None):
        """
        Fits the target feature generator on the original input, the input feature generator (if any),
        and finally the model on the (transformed) input and the generated targets.
        """
        # the targets are generated from the original (untransformed) input
        target_df = self.targetFeatureGenerator.fit_generate(x, y)
        if self.inputFeatureGenerator:
            x = self.inputFeatureGenerator.fit_generate(x, y)
        self.vectorModel.fit(x, target_df)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates the features either via the target feature generator or via the model's predictions.

        :param df: the input data frame
        :param ctx: the context, which is queried via is_fitted() if useTargetFeatureGeneratorForTraining is enabled;
            if it is None, the model is applied
        :return: the generated features
        """
        # Without a context, it cannot be determined whether the target features should be used instead of the
        # predictions, so the model is applied in that case.
        if self.useTargetFeatureGeneratorForTraining and ctx is not None and not ctx.is_fitted():
            log.debug("Using targetFeatureGenerator %s to generate target features",
                self.targetFeatureGenerator.__class__.__name__)
            # The target feature generator was fitted on the original input in _fit, so it is applied to
            # the original input here as well (not to the output of the input feature generator).
            return self.targetFeatureGenerator.generate(df)

        if self.inputFeatureGenerator:
            df = self.inputFeatureGenerator.generate(df)
        log.debug("Generating target features via %s", self.vectorModel.__class__.__name__)
        return self.vectorModel.predict(df)

    def info(self):
        """
        :return: the info of the super-class, extended with the string representation of the wrapped model
        """
        info = super().info()
        info["wrappedModel"] = str(self.vectorModel)
        return info
class FeatureGeneratorMapColumn(RuleBasedFeatureGenerator, ABC):
    """
    Creates a single feature from a single input column by applying a function to each element of the input column
    """
    def __init__(self,
            input_col_name: str,
            feature_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules=True):
        """
        :param input_col_name: the name of the input column whose values are to be mapped
        :param feature_col_name: the name of the generated feature column
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        :param add_categorical_default_rules: passed on to the super-class constructor
        """
        super().__init__(
            categorical_feature_names=categorical_feature_names,
            normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name
        self._featureColName = feature_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the input data frame, which must contain the input column
        :param ctx: not used by this method
        :return: a data frame (with the index of df) containing the single feature column
        :raises ValueError: if the input column is not present in df
        """
        if self._inputColName in df.columns:
            mapped_values = df[self._inputColName].apply(self._create_value)
            return pd.DataFrame({self._featureColName: mapped_values}, index=df.index)
        raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
            f"{list(df.columns)}")

    @abstractmethod
    def _create_value(self, value):
        """
        Maps a value from the input column to a feature value

        :param value: a value from the input column
        :return: the feature value
        """
class FeatureGeneratorMapColumnDict(RuleBasedFeatureGenerator, ABC):
    """
    Creates an arbitrary number of features from a single input column by applying a function to each element of the input column
    """
    def __init__(self,
            input_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules=True):
        """
        :param input_col_name: the name of the input column whose values are to be mapped
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        :param add_categorical_default_rules: passed on to the super-class constructor
        """
        super().__init__(
            categorical_feature_names=categorical_feature_names,
            normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the input data frame, which must contain the input column
        :param ctx: not used by this method
        :return: a data frame (with the index of df) constructed from the feature dictionaries
            created for the elements of the input column
        :raises ValueError: if the input column is not present in df
        """
        if self._inputColName in df.columns:
            feature_dicts = list(map(self._create_features_dict, df[self._inputColName]))
            return pd.DataFrame(feature_dicts, index=df.index)
        raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
            f"{list(df.columns)}")

    @abstractmethod
    def _create_features_dict(self, value) -> Dict[str, Any]:
        """
        Maps a value from the input column to a dictionary containing one or more features.

        :param value: a value from the input column
        :return: a dictionary mapping feature names to values
        """
class FeatureGeneratorNAMarker(RuleBasedFeatureGenerator):
    """
    Creates features indicating whether another feature is N/A (not available).
    It can be practical to use this feature generator in conjunction with DFTFillNA for models that cannot handle missing values.
    """
    def __init__(self, columns: List[str], value_a=0, value_na=1):
        """
        Note: When changing the default values used, use only values that are considered to be normalised when using this
        feature generation in a context where DFTNormalisation is used (no normalisation is applied to features generated
        by this feature generator).

        :param columns: the columns for which to generate marker features
        :param value_a: the feature value if the input feature is available
        :param value_na: the feature value if the input feature is not available
        """
        super().__init__(normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True))
        self.columns = columns
        self.valueA = value_a
        self.valueNA = value_na

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the input data frame, which must contain all of the configured columns
        :param ctx: not used by this method
        :return: a data frame (with the index of df) containing, for each configured column "<col>",
            a column "<col>_na" holding the respective marker values
        """
        marker_na, marker_available = self.valueNA, self.valueA
        marker_columns = {
            f"{col}_na": [marker_na if is_na else marker_available for is_na in df[col].isna()]
            for col in self.columns
        }
        return pd.DataFrame(marker_columns, index=df.index)
def flattened_feature_generator(fgen: FeatureGenerator,
        columns_to_flatten: Optional[List[str]] = None,
        keep_other_columns=True,
        normalisation_rules: Sequence[DFTNormalisation.Rule] = (),
        normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None):
    """
    Return a flattening version of the input feature generator.

    :param fgen: the feature generator which generates columns that are to be flattened
    :param columns_to_flatten: list of names of output columns to be flattened; if None, flatten all columns
    :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained
        by the returned feature generator
    :param normalisation_rules: additional normalisation rules for the flattened output columns
    :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where
        there shall be a single rule that applies to all flattened output columns
    :return: FeatureGenerator instance that will generate flattened versions of the specified columns and leave
        all other output columns as is.

    Example:
        >>> from sensai.featuregen import FeatureGeneratorTakeColumns, flattened_feature_generator
        >>> import pandas as pd
        >>>
        >>> df = pd.DataFrame({"foo": [[1, 2], [3, 4]], "bar": ["a", "b"]})
        >>> fgen = flattened_feature_generator(FeatureGeneratorTakeColumns(), columns_to_flatten=["foo"])
        >>> fgen.generate(df)
           foo_0  foo_1 bar
        0      1      2   a
        1      3      4   b
    """
    flattener = FeatureGeneratorFlattenColumns(
        columns=columns_to_flatten,
        normalisation_rules=normalisation_rules,
        normalisation_rule_template=normalisation_rule_template)

    # Other columns can only be retained separately if the columns to flatten are explicitly specified
    retain_other_columns = keep_other_columns and columns_to_flatten is not None
    if not retain_other_columns:
        return ChainedFeatureGenerator(fgen, flattener)

    other_columns_generator = FeatureGeneratorTakeColumns(except_columns=columns_to_flatten)
    return ChainedFeatureGenerator(fgen, MultiFeatureGenerator(flattener, other_columns_generator))
class FeatureGeneratorFromDFT(FeatureGenerator):
    """
    Feature generator which delegates to a data frame transformer: fitting fits the transformer on the
    input data frame and generation returns the result of applying the transformer.
    """
    def __init__(self,
            dft: DataFrameTransformer,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules=True):
        """
        :param dft: the data frame transformer to delegate to
        :param categorical_feature_names: passed on to the super-class constructor
        :param normalisation_rules: passed on to the super-class constructor
        :param normalisation_rule_template: passed on to the super-class constructor
        :param add_categorical_default_rules: passed on to the super-class constructor
        """
        super().__init__(
            categorical_feature_names=categorical_feature_names,
            normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self.dft = dft

    def _fit(self, x: pd.DataFrame, y: Optional[pd.DataFrame] = None, ctx=None):
        """
        Fits the wrapped transformer on the input data frame (y and ctx are not used).
        """
        self.dft.fit(x)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        :param df: the input data frame
        :param ctx: not used by this method
        :return: the result of applying the wrapped transformer to df
        """
        transformed_df = self.dft.apply(df)
        return transformed_df