Source code for sensai.featuregen.feature_generator

import functools
import logging
import re
from abc import ABC, abstractmethod
from typing import Sequence, List, Union, Callable, Any, Dict, TYPE_CHECKING, Optional

import numpy as np
import pandas as pd

from .. import util, data_transformation
from ..data_transformation import DFTNormalisation, DFTFromFeatureGenerator, DataFrameTransformer
from ..util import flatten_arguments
from ..util.string import or_regex_group, ToStringMixin, list_string
from ..util.typing import PandasNamedTuple

if TYPE_CHECKING:
    from ..vector_model import VectorModel
    from ..columngen import ColumnGenerator


log = logging.getLogger(__name__)


class DuplicateColumnNamesException(Exception):
    pass

class FeatureGenerator(ToStringMixin, ABC):
    """
    Base class for feature generators that create a new data frame containing feature values
    from an input data frame
    """
    def __init__(self,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None,
            add_categorical_default_rules: bool = True):
        """
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical
            feature names. The regex must match only features generated by this feature generator, i.e. it must not match
            feature names generated by other feature generators.
            It will be ensured that the respective columns in the generated data frames have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations,
            e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for
            a model). These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: may be supplied instead of `normalisation_rules` for the case where a single
            rule shall apply to all columns generated by this feature generator that were not labelled as categorical.
            Like `normalisation_rules`, this is only relevant if a DFTNormalisation object consuming normalisation rules
            is instantiated and used within a data processing pipeline. It does not affect feature generation.
        :param add_categorical_default_rules: if True, normalisation rules for categorical features (which are
            unsupported by normalisation) and their corresponding one-hot encoded features (with "_<index>" appended)
            will be added. This does not affect feature generation.
        """
        # NOTE: While it would be more elegant not to have all of the above constructor arguments and to instead provide
        # them later via "with*" methods, this would have the significant drawback of enabling all such attributes to be
        # provided in all subclasses, even in ones where we know the settings exactly and can provide them directly in
        # the subclass constructor implementation. It would thus enable nonsensical settings, which should be avoided.
        if len(normalisation_rules) > 0 and normalisation_rule_template is not None:
            raise ValueError("Normalisation rules must be empty when a rule template is provided")

        self._generatedColumnNames = None
        self.__categoricalFeatureNames = categorical_feature_names

        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: str = categorical_feature_name_regex
        self._categoricalFeatureRules = []

        if normalisation_rule_template is not None:
            # NOTE: the placeholder rule's regex will be set in generate
            self._normalisationRules = [normalisation_rule_template.to_placeholder_rule()]
            self._mustUpdateNormalisationRuleBasedOnColumnNames = True
        else:
            self._normalisationRules = list(normalisation_rules)
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        if add_categorical_default_rules and categorical_feature_name_regex is not None:
            self._categoricalFeatureRules.append(
                data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex, unsupported=True))
            self._categoricalFeatureRules.append(  # rule for one-hot encoded features
                data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex + r"_\d+", skip=True))

        self._name: Optional[str] = None
        self._isFitted = False

    # for backwards compatibility with persisted feature generators based on code prior to commit 7088cbbe:
    # they lack the _isFitted attribute, and we assume that each such feature generator was fitted
    def __setstate__(self, d):
        d["_isFitted"] = d.get("_isFitted", True)
        self.__dict__ = d

    def _tostring_exclude_private(self) -> bool:
        return True

    def _tostring_additional_entries(self) -> Dict[str, Any]:
        return dict(name=self.get_name())

    def get_name(self) -> str:
        """
        :return: the name of this feature generator, which may be a default name if the name has not been set.
            Note that feature generators created by a FeatureGeneratorFactory always receive the name with which
            the generator factory was registered.
        """
        if self._name is None:
            return f"{self.__class__.__name__}-{id(self)}"
        return self._name

    def set_name(self, name: str) -> None:
        self._name = name

    def get_names(self) -> List[str]:
        """
        :return: the list of names of feature generators; will be a list with a single name for a regular feature generator
        """
        return [self.get_name()]

    def info(self):
        return {
            "name": self.get_name(),
            "categoricalFeatureNames": self.__categoricalFeatureNames,
            "generatedColumnNames": self.get_generated_column_names(),
            "isFitted": self.is_fitted(),
            "normalisationRules": self.get_normalisation_rules(),
        }

    def get_normalisation_rules(self, include_generated_categorical_rules=True) -> List[data_transformation.DFTNormalisation.Rule]:
        if include_generated_categorical_rules:
            return self._normalisationRules + self._categoricalFeatureRules
        else:
            return self._normalisationRules

    def get_categorical_feature_name_regex(self) -> Optional[str]:
        return self._categoricalFeatureNameRegex

    def is_categorical_feature(self, feature_name):
        if self._categoricalFeatureNameRegex is None:
            return False
        return re.fullmatch(self._categoricalFeatureNameRegex, feature_name) is not None

    def get_generated_column_names(self) -> Optional[List[str]]:
        """
        :return: the column names of the data frame generated by the most recent call of the feature generator's
            `generate` method; None if generate was never called
        """
        return self._generatedColumnNames

    def to_dft(self):
        return DFTFromFeatureGenerator(self)

    @abstractmethod
    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        pass

    def fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Fits the feature generator based on the given data

        :param x: the input/features data frame for the learning problem
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        """
        log.debug(f"Fitting {self}")
        self._fit(x, y=y, ctx=ctx)
        self._isFitted = True

    def is_fitted(self):
        return self._isFitted

    def generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``
        """
        if not self.is_fitted():
            raise Exception(f"Cannot generate features from a FeatureGenerator which is not fitted: "
                            f"the feature generator {self.get_name()} requires fitting")
        log.debug(f"Generating features with {self}")
        result_df = self._generate(df, ctx=ctx)

        is_column_duplicated_array = result_df.columns.duplicated()
        if any(is_column_duplicated_array):
            duplicated_columns = set(result_df.columns[is_column_duplicated_array])
            raise DuplicateColumnNamesException(f"Feature data frame contains duplicate column names: {duplicated_columns}")

        # ensure that categorical columns have dtype 'category'
        categorical_feature_names = []
        if self._categoricalFeatureNameRegex is not None:
            result_df = result_df.copy()  # the data frame we got might be a view of some other data frame, so we must copy it before modifying it
            categorical_feature_names = [col for col in result_df.columns if self.is_categorical_feature(col)]
            for col_name in categorical_feature_names:
                series = result_df[col_name].copy()
                if series.dtype.name != 'category':
                    result_df[col_name] = series.astype('category', copy=False)

        self._generatedColumnNames = result_df.columns

        # finalise the normalisation rule template (if any) by making it apply to all non-categorical features
        # (a default rule applies to categorical features)
        if self._mustUpdateNormalisationRuleBasedOnColumnNames:
            non_categorical_features = list(set(self._generatedColumnNames).difference(categorical_feature_names))
            # NOTE: We update the existing rule, which was instantiated with a placeholder regex, because some mechanisms
            # (e.g. MultiFeatureGenerator) retrieve rule instances early on (before generate is ever called); updating an
            # existing rule is therefore the safe route and should always work, because rules are never actually applied
            # before generate has indeed been called.
            self._normalisationRules[0].set_regex(or_regex_group(non_categorical_features))
            self._mustUpdateNormalisationRuleBasedOnColumnNames = False

        return result_df

    @abstractmethod
    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        """
        Generates features for the data points in the given data frame.

        :param df: the input data frame for which to generate features
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``df``.
            The columns holding categorical features are not required to have dtype ``category``;
            this will be ensured by the encapsulating call as long as the respective columns' names
            were appropriately provided at construction.
        """
        pass

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        """
        Fits the feature generator and subsequently generates features for the data points in the given data frame

        :param x: the input data frame for the learning problem and for which to generate features
        :param y: the corresponding output data frame for the learning problem
            (which will typically contain regression or classification target columns)
        :param ctx: a context object whose functionality may be required for feature generation;
            this is typically the model instance that this feature generator is to generate inputs for
        :return: a data frame containing the generated features, which uses the same index as ``x`` (and ``y``)
        """
        self.fit(x, y, ctx)
        return self.generate(x, ctx)

    def flattened(self,
            columns_to_flatten: List[str] = None,
            normalisation_rules=(),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            keep_other_columns=True) -> "ChainedFeatureGenerator":
        """
        Returns a new feature generator which returns flattened versions of one or more of the vector-valued columns
        generated by this feature generator.

        :param columns_to_flatten: the list of columns to flatten; if None, flatten all columns
        :param normalisation_rules: a list of normalisation rules which apply to the flattened columns
        :param normalisation_rule_template: a normalisation rule template which applies to all generated flattened columns
        :param keep_other_columns: if True, any additional columns that are not to be flattened are retained by the
            returned feature generator; if False, such columns are discarded
        :return: a feature generator which generates the flattened columns
        """
        return flattened_feature_generator(self, columns_to_flatten=columns_to_flatten,
            normalisation_rules=normalisation_rules, keep_other_columns=keep_other_columns,
            normalisation_rule_template=normalisation_rule_template)

    def concat(self, *others: "FeatureGenerator") -> "MultiFeatureGenerator":
        """
        Concatenates this feature generator with one or more other feature generators in order to produce
        a feature generator that jointly generates all features

        :param others: other feature generators
        :return: a :class:`MultiFeatureGenerator`
        """
        if isinstance(self, MultiFeatureGenerator):
            fgens = list(self.featureGenerators)
        else:
            fgens = [self]
        fgens.extend(others)
        return MultiFeatureGenerator(fgens)

    def chain(self, *others: "FeatureGenerator") -> "ChainedFeatureGenerator":
        """
        Chains this feature generator with one or more other feature generators such that each feature generator
        receives as input the output of the preceding feature generator. The resulting feature generator
        produces the features of the last element in the chain.

        :param others: other feature generators
        :return: a :class:`ChainedFeatureGenerator`
        """
        if isinstance(self, ChainedFeatureGenerator):
            fgens = list(self.featureGenerators)  # copy to avoid modifying this generator's own list
        else:
            fgens = [self]
        fgens.extend(others)
        return ChainedFeatureGenerator(fgens)

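# Usage sketch (illustrative; the column names "price" and "log_price" are hypothetical): a minimal
# FeatureGenerator subclass implements _fit and _generate, after which fit_generate yields the feature
# data frame.
#
# >>> class LogPriceFeatureGenerator(FeatureGenerator):
# ...     def _fit(self, x, y=None, ctx=None):
# ...         pass  # nothing to fit for this simple transformation
# ...     def _generate(self, df, ctx=None):
# ...         return pd.DataFrame({"log_price": np.log(df["price"])}, index=df.index)
# >>> LogPriceFeatureGenerator().fit_generate(pd.DataFrame({"price": [1.0, 10.0]}))
#    log_price
# 0   0.000000
# 1   2.302585
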
class RuleBasedFeatureGenerator(FeatureGenerator, ABC):
    """
    A feature generator which does not require fitting
    """
    def fit(self, x, y=None, ctx=None):
        pass

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        pass

    def is_fitted(self):
        return True

class MultiFeatureGenerator(FeatureGenerator):
    """
    Wrapper for multiple feature generators. Calling generate here applies all given feature generators
    independently and returns the concatenation of their outputs
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        self.featureGenerators = feature_generators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            log.debug("Creating an empty MultiFeatureGenerator. It will generate a data frame without columns.")
        categorical_feature_name_regexes = [regex for regex in
            [fg.get_categorical_feature_name_regex() for fg in feature_generators] if regex is not None]
        if len(categorical_feature_name_regexes) > 0:
            categorical_feature_names = "|".join(categorical_feature_name_regexes)
        else:
            categorical_feature_names = ()
        normalisation_rules = util.concat_sequences([fg.get_normalisation_rules() for fg in feature_generators])
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            add_categorical_default_rules=False)

    def _tostring_object_info(self) -> str:
        return f"featureGenerators={list_string(self.featureGenerators)}"

    def _generate_from_multiple(self, generate_features: Callable[[FeatureGenerator], pd.DataFrame], index) -> pd.DataFrame:
        dfs = []
        for fg in self.featureGenerators:
            df = generate_features(fg)
            dfs.append(df)
        if len(dfs) == 0:
            return pd.DataFrame(index=index)
        else:
            combined_df = pd.concat(dfs, axis=1)
            if len(combined_df.columns) != len(set(combined_df.columns)):
                raise Exception(f"At least one column was generated more than once: {list(combined_df.columns)}; "
                                f"check feature generators for correctness!")
            return combined_df

    def _generate(self, input_df: pd.DataFrame, ctx=None):
        def generate_features(fg: FeatureGenerator):
            return fg.generate(input_df, ctx=ctx)
        return self._generate_from_multiple(generate_features, input_df.index)

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")

        def generate_features(fg: FeatureGenerator):
            return fg.fit_generate(x, y, ctx)

        return self._generate_from_multiple(generate_features, x.index)

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        for fg in self.featureGenerators:
            fg.fit(x, y, ctx=ctx)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["featureGeneratorNames"] = self.get_names()
        return info

    def get_names(self) -> list:
        return functools.reduce(lambda x, y: x + y, [fg.get_names() for fg in self.featureGenerators], [])

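# Usage sketch (illustrative; column names are hypothetical): a MultiFeatureGenerator applies each
# contained generator to the same input and concatenates the resulting columns.
#
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]})
# >>> MultiFeatureGenerator(FeatureGeneratorTakeColumns("a"), FeatureGeneratorTakeColumns("b")).fit_generate(df)
#    a  b
# 0  1  3
# 1  2  4
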
class FeatureGeneratorFromNamedTuples(FeatureGenerator, ABC):
    """
    Generates feature values for one data point at a time, creating a dictionary with
    feature values from each named tuple
    """
    def __init__(self, cache: util.cache.KeyValueCache = None, categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.cache = cache

    def _generate(self, df: pd.DataFrame, ctx=None):
        dicts = []
        for idx, nt in enumerate(df.itertuples()):
            nt: PandasNamedTuple
            if idx % 100 == 0:
                log.debug(f"Generating features via {self.__class__.__name__} for index {idx}")
            value = None
            if self.cache is not None:
                value = self.cache.get(nt.Index)
            if value is None:
                value = self._generate_feature_dict(nt)
                if self.cache is not None:
                    self.cache.set(nt.Index, value)
            dicts.append(value)
        return pd.DataFrame(dicts, index=df.index)

    @abstractmethod
    def _generate_feature_dict(self, named_tuple) -> Dict[str, Any]:
        """
        Creates a dictionary with feature values from a named tuple

        :param named_tuple: the data point for which to generate features
        :return: a dictionary mapping feature names to values
        """
        pass

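# Sketch of a subclass (illustrative; the "text" column is hypothetical): one feature dictionary is
# created per row, with the optional cache keyed by the row index.
#
# >>> class TextLengthFeatureGenerator(FeatureGeneratorFromNamedTuples):
# ...     def _fit(self, x, y=None, ctx=None):
# ...         pass  # no fitting required
# ...     def _generate_feature_dict(self, named_tuple):
# ...         return {"text_length": len(named_tuple.text)}
# >>> TextLengthFeatureGenerator().fit_generate(pd.DataFrame({"text": ["foo", "quux"]}))
#    text_length
# 0            3
# 1            4
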
class FeatureGeneratorTakeColumns(RuleBasedFeatureGenerator):
    def __init__(self,
            columns: Union[str, List[str]] = None,
            except_columns: Sequence[str] = (),
            categorical_feature_names: Optional[Union[Sequence[str], str]] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            verify_column_names=True):
        """
        :param columns: the name of the column to be taken or a list of names of columns to be taken;
            if None, all columns will be taken
        :param except_columns: the names of columns not to take, should they be present in the input data frame
        :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical
            feature names. The regex must match only features generated by this feature generator, i.e. it must not match
            feature names generated by other feature generators.
            It will be ensured that the respective columns in the generated data frames have dtype 'category'.
            Furthermore, the presence of this meta-information can later be leveraged for further transformations,
            e.g. one-hot encoding.
        :param normalisation_rules: rules to be used by DFTNormalisation (e.g. for constructing an input transformer for
            a model). These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used
            within a data processing pipeline. They do not affect feature generation.
        :param normalisation_rule_template: may be supplied instead of `normalisation_rules` for the case where a single
            rule shall apply to all columns generated by this feature generator that were not labelled as categorical.
        :param verify_column_names: if True and columns to take were specified, raise an error if said columns are
            missing during feature generation; if False, log at INFO level instead
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.exceptColumns = except_columns
        self.verifyColumnNames = verify_column_names

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        columns_to_take = self.columns if self.columns is not None else df.columns
        columns_to_take = [col for col in columns_to_take if col not in self.exceptColumns]

        if self.columns is not None:
            missing_cols = set(columns_to_take).difference(df.columns)
            if len(missing_cols) > 0:
                missing_cols_notification = f"Columns {missing_cols} were specified but are not present in the data frame. " \
                    f"verifyColumnNames was set to {self.verifyColumnNames}; " \
                    f"available columns: {list(df.columns)}"
                if self.verifyColumnNames:
                    raise RuntimeError(missing_cols_notification)
                log.info(missing_cols_notification)

        return df[columns_to_take]

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["exceptColumns"] = self.exceptColumns
        return info

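# Usage sketch (illustrative column names): passing through a subset of the input columns.
#
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# >>> FeatureGeneratorTakeColumns("a").fit_generate(df)
#    a
# 0  1
# 1  2
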
class FeatureGeneratorFlattenColumns(RuleBasedFeatureGenerator):
    """
    Instances of this class take columns containing vectors and create a data frame whose columns contain the
    entries of these vectors. For example, if columns "vec1" and "vec2" contain vectors of dimensions dim1 and dim2,
    a data frame with dim1+dim2 new columns will be created, containing the columns "vec1_<i1>" and "vec2_<i2>"
    with i1 and i2 ranging in (0, dim1) and (0, dim2) respectively.
    """
    def __init__(self,
            columns: Optional[Union[str, Sequence[str]]] = None,
            categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param columns: the name of the column to be flattened or a list of names of columns to be flattened;
            if None, all columns will be flattened
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        result_df = pd.DataFrame(index=df.index)
        columns_to_flatten = self.columns if self.columns is not None else df.columns
        for col in columns_to_flatten:
            log.debug(f"Flattening column {col}")
            # NOTE: We found the use of np.stack to produce the most runtime-efficient results.
            # Other variants, e.g. based on lists instead of numpy arrays, perform much worse.
            values = np.stack(df[col].values)
            if len(values.shape) != 2:
                raise ValueError(f"Column {col} was expected to contain one-dimensional vectors, something went wrong")
            dimension = values.shape[1]
            new_columns = [f"{col}_{i}" for i in range(dimension)]
            log.debug(f"Flattening resulted in {len(new_columns)} new columns")
            result_df[new_columns] = pd.DataFrame(values, index=df.index)
        return result_df

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        return info

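# Usage sketch (illustrative column name): flattening a column of 2-dimensional vectors into
# two scalar columns.
#
# >>> df = pd.DataFrame({"vec": [[1, 2], [3, 4]]})
# >>> FeatureGeneratorFlattenColumns("vec").fit_generate(df)
#    vec_0  vec_1
# 0      1      2
# 1      3      4
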
class FeatureGeneratorFromColumnGenerator(RuleBasedFeatureGenerator):
    """
    Implements a feature generator via a column generator
    """
    log = log.getChild(__qualname__)

    def __init__(self, column_gen: 'ColumnGenerator', take_input_column_if_present=False, is_categorical=False,
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
        """
        :param column_gen: the underlying column generator
        :param take_input_column_if_present: if True and a column whose name corresponds to the column to generate
            exists in the input data, simply copy it to generate the output (without using the column generator);
            if False, always apply the column generator to generate the output
        :param is_categorical: whether the resulting column is categorical
        :param normalisation_rule_template: template for a DFTNormalisation rule for the resulting column;
            this should only be provided if is_categorical is False
        """
        if is_categorical and normalisation_rule_template is not None:
            raise ValueError("normalisation_rule_template must be None when the generated column is categorical")
        categorical_feature_names = (column_gen.generatedColumnName,) if is_categorical else ()
        super().__init__(categorical_feature_names=categorical_feature_names,
            normalisation_rule_template=normalisation_rule_template)
        self.takeInputColumnIfPresent = take_input_column_if_present
        self.columnGen = column_gen

    def info(self):
        info = super().info()
        info["takeInputColumnIfPresent"] = self.takeInputColumnIfPresent
        info["generatedColName"] = self.columnGen.generatedColumnName
        return info

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        col_name = self.columnGen.generatedColumnName
        if self.takeInputColumnIfPresent and col_name in df.columns:
            self.log.debug(f"Taking column '{col_name}' from input data frame")
            series = df[col_name]
        else:
            self.log.debug(f"Generating column '{col_name}' via {self.columnGen}")
            series = self.columnGen.generate_column(df)
        return pd.DataFrame({col_name: series})

class ChainedFeatureGenerator(FeatureGenerator):
    """
    Chains feature generators such that they are executed one after another. The output of generator i
    is the input of generator i+1 in the generator sequence.
    """
    def __init__(self, *feature_generators: Union[FeatureGenerator, List[FeatureGenerator]]):
        """
        :param feature_generators: feature generators to apply in order; the properties of the last feature generator
            determine the relevant meta-data such as categorical feature names and normalisation rules
        """
        self.featureGenerators = flatten_arguments(feature_generators)
        if len(self.featureGenerators) == 0:
            raise ValueError("Empty list of feature generators")
        last_fg: FeatureGenerator = self.featureGenerators[-1]
        super().__init__(categorical_feature_names=last_fg.get_categorical_feature_name_regex(),
            normalisation_rules=last_fg.get_normalisation_rules(), add_categorical_default_rules=False)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        for feature_gen in self.featureGenerators:
            df = feature_gen.generate(df, ctx)
        return df

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.fit_generate(x, y, ctx)

    def is_fitted(self):
        return all(fg.is_fitted() for fg in self.featureGenerators)

    def info(self):
        info = super().info()
        info["chainedFeatureGeneratorNames"] = self.get_names()
        return info

    def fit_generate(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None) -> pd.DataFrame:
        log.debug(f"Fitting and generating features with {self}")
        for fg in self.featureGenerators:
            x = fg.fit_generate(x, y, ctx)
        return x

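# Usage sketch (illustrative): the first generator selects the vector-valued column, and the second,
# which receives the first generator's output as its input, flattens it.
#
# >>> df = pd.DataFrame({"vec": [[1, 2], [3, 4]], "other": [0, 0]})
# >>> ChainedFeatureGenerator(FeatureGeneratorTakeColumns("vec"), FeatureGeneratorFlattenColumns()).fit_generate(df)
#    vec_0  vec_1
# 0      1      2
# 1      3      4
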
class FeatureGeneratorTargetDistribution(FeatureGenerator):
    """
    A feature generator which, for a column T (typically the categorical target column of a classification problem
    or the continuous target column of a regression problem),

    * can ensure that T takes on a limited set of values t_1, ..., t_n by allowing the user to apply binning
      using given bin boundaries,
    * computes, for each value c of a categorical column C, the conditional empirical distribution P(T | C=c)
      in the training data during the training phase, and
    * generates, for each requested column C and each value c in the column,
      n features '<C>_<T>_distribution_<t_i>' = P(T=t_i | C=c) if flatten=True,
      or one feature '<C>_<T>_distribution' = [P(T=t_1 | C=c), ..., P(T=t_n | C=c)] if flatten=False.

    Being probability values, the features generated by this feature generator are already normalised.
    """
    def __init__(self, columns: Union[str, Sequence[str]], target_column: str,
            target_column_bins: Optional[Union[Sequence[float], int, pd.IntervalIndex]],
            target_column_in_features_df=False,
            flatten=True):
        """
        :param columns: the categorical columns for which to generate distribution features
        :param target_column: the column over which the distributions will be computed to make up the features.
            If target_column_bins is not None, this column will be discretised before computing the conditional
            distributions.
        :param target_column_bins: if not None, specifies the binning to apply via pandas.cut
            (see https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html).
            Note that if a value matches no bin, NaN will be generated. To avoid this when specifying bin boundaries
            in a list, -inf and +inf should be used as the first and last entries.
        :param target_column_in_features_df: if True, the fitting process will look for target_column in the features
            data frame (x) instead of the target data frame (y)
        :param flatten: whether to generate a separate scalar feature per distribution value rather than one feature
            containing all of the distribution's values
        """
        self.flatten = flatten
        if isinstance(columns, str):
            columns = [columns]
        self.columns = columns
        self.targetColumn = target_column
        self.targetColumnInFeaturesDf = target_column_in_features_df
        self.targetColumnBins = target_column_bins
        if self.flatten:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(skip=True)
        else:
            normalisation_rule_template = data_transformation.DFTNormalisation.RuleTemplate(unsupported=True)
        super().__init__(normalisation_rule_template=normalisation_rule_template)
        self._targetColumnValues = None
        # This will hold the mapping: column -> featureValue -> targetValue -> targetValueEmpiricalProbability
        self._discreteTargetDistributionsByColumn: Optional[Dict[str, Dict[Any, Dict[Any, float]]]] = None

    def info(self):
        info = super().info()
        info["columns"] = self.columns
        info["targetColumn"] = self.targetColumn
        info["targetColumnBins"] = self.targetColumnBins
        info["flatten"] = self.flatten
        return info

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        """
        Persists the empirical target probability distributions for all unique values in the specified columns
        """
        if self.targetColumnInFeaturesDf:
            target = x[self.targetColumn]
        else:
            target = y[self.targetColumn]
        if self.targetColumnBins is not None:
            discretised_target = pd.cut(target, self.targetColumnBins)
        else:
            discretised_target = target
        self._targetColumnValues = discretised_target.unique()

        self._discreteTargetDistributionsByColumn = {}
        for column in self.columns:
            self._discreteTargetDistributionsByColumn[column] = {}
            column_target_df = pd.DataFrame()
            column_target_df[column] = x[column]
            column_target_df["target"] = discretised_target.values
            for value, value_targets_df in column_target_df.groupby(column):
                # the normalised value_counts give the mapping targetValue -> targetValueEmpiricalProbability
                # for the current value
                self._discreteTargetDistributionsByColumn[column][value] = \
                    value_targets_df["target"].value_counts(normalize=True).to_dict()

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._discreteTargetDistributionsByColumn is None:
            raise Exception("Feature generator has not been fitted")
        result_df = pd.DataFrame(index=df.index)
        for column in self.columns:
            target_distribution_by_value = self._discreteTargetDistributionsByColumn[column]
            if self.flatten:
                for target_value in self._targetColumnValues:
                    # IMPORTANT: pd.Series.apply should not be used here, as it would label the resulting column as categorical
                    result_df[f"{column}_{self.targetColumn}_distribution_{target_value}"] = \
                        [target_distribution_by_value[value].get(target_value, 0.0) for value in df[column]]
            else:
                distributions = [[target_distribution_by_value[value].get(target_value, 0.0)
                    for target_value in self._targetColumnValues] for value in df[column]]
                result_df[f"{column}_{self.targetColumn}_distribution"] = pd.Series(distributions, index=df[column].index)
        return result_df

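# Usage sketch (illustrative; the columns "color" and "label" are hypothetical): for each value of the
# categorical column, the empirical distribution of the target is computed during fitting and then used
# to generate one probability feature per target value.
#
# >>> x = pd.DataFrame({"color": ["r", "r", "b"]})
# >>> y = pd.DataFrame({"label": [1, 0, 1]})
# >>> FeatureGeneratorTargetDistribution("color", "label", target_column_bins=None).fit_generate(x, y)
#    color_label_distribution_1  color_label_distribution_0
# 0                         0.5                         0.5
# 1                         0.5                         0.5
# 2                         1.0                         0.0
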
class FeatureGeneratorFromVectorModel(FeatureGenerator):
    def __init__(self,
            vector_model: "VectorModel",
            target_feature_generator: FeatureGenerator,
            categorical_feature_names: Sequence[str] = (),
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            input_feature_generator: FeatureGenerator = None,
            use_target_feature_generator_for_training=False):
        """
        Provides features via the predictions of a given model

        :param vector_model: the model whose predictions are used to generate features
        :param target_feature_generator: the generator for the target to be predicted
        :param categorical_feature_names:
        :param normalisation_rules:
        :param normalisation_rule_template:
        :param input_feature_generator: optional feature generator to be applied to the input of vector_model's
            fit and predict methods
        :param use_target_feature_generator_for_training: if False, this generator will always apply the model to
            generate features. If True, it will instead use target_feature_generator to generate features during
            training, bypassing the model. This is useful for the case where the model which is to receive the
            generated features shall be trained on the original targets rather than the predictions thereof.
        """
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template)
        self.targetFeatureGenerator = target_feature_generator
        self.inputFeatureGenerator = input_feature_generator
        self.useTargetFeatureGeneratorForTraining = use_target_feature_generator_for_training
        self.vectorModel = vector_model

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        target_df = self.targetFeatureGenerator.fit_generate(x, y)
        if self.inputFeatureGenerator:
            x = self.inputFeatureGenerator.fit_generate(x, y)
        self.vectorModel.fit(x, target_df)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self.inputFeatureGenerator:
            df = self.inputFeatureGenerator.generate(df)
        if self.useTargetFeatureGeneratorForTraining and not ctx.is_fitted():
            log.debug(f"Using targetFeatureGenerator {self.targetFeatureGenerator.__class__.__name__} to generate target features")
            return self.targetFeatureGenerator.generate(df)
        else:
            log.debug(f"Generating target features via {self.vectorModel.__class__.__name__}")
            return self.vectorModel.predict(df)

    def info(self):
        info = super().info()
        info["wrappedModel"] = str(self.vectorModel)
        return info

class FeatureGeneratorMapColumn(RuleBasedFeatureGenerator, ABC):
    """
    Creates a single feature from a single input column by applying a function to each element of the input column
    """
    def __init__(self,
            input_col_name: str,
            feature_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name
        self._featureColName = feature_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                             f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = input_series.apply(self._create_value)
        return pd.DataFrame({self._featureColName: values}, index=df.index)

    @abstractmethod
    def _create_value(self, value):
        """
        Maps a value from the input column to a feature value

        :param value: a value from the input column
        :return: the feature value
        """
        pass

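# Sketch of a subclass (illustrative names): a single output feature computed element-wise from a
# single input column.
#
# >>> class CelsiusToKelvinFeatureGenerator(FeatureGeneratorMapColumn):
# ...     def __init__(self):
# ...         super().__init__(input_col_name="temp_c", feature_col_name="temp_k")
# ...     def _create_value(self, value):
# ...         return value + 273.15
# >>> CelsiusToKelvinFeatureGenerator().fit_generate(pd.DataFrame({"temp_c": [0.0, 20.0]}))
#    temp_k
# 0  273.15
# 1  293.15
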
class FeatureGeneratorMapColumnDict(RuleBasedFeatureGenerator, ABC):
    """
    Creates an arbitrary number of features from a single input column by applying a function to each element
    of the input column
    """
    def __init__(self,
            input_col_name: str,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self._inputColName = input_col_name

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        if self._inputColName not in df.columns:
            raise ValueError(f"Column '{self._inputColName}' required by feature generator not found in list of columns: "
                             f"{list(df.columns)}")
        input_series = df[self._inputColName]
        values = [self._create_features_dict(v) for v in input_series]
        return pd.DataFrame(values, index=df.index)

    @abstractmethod
    def _create_features_dict(self, value) -> Dict[str, Any]:
        """
        Maps a value from the input column to a dictionary containing one or more features.

        :param value: a value from the input column
        :return: a dictionary mapping feature names to values
        """
        pass

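# Sketch of a subclass (illustrative names): several output features computed from a single input
# column by returning a dictionary per element.
#
# >>> class TextStatsFeatureGenerator(FeatureGeneratorMapColumnDict):
# ...     def __init__(self):
# ...         super().__init__(input_col_name="text")
# ...     def _create_features_dict(self, value):
# ...         return {"length": len(value), "num_words": len(value.split())}
# >>> TextStatsFeatureGenerator().fit_generate(pd.DataFrame({"text": ["hello world", "foo"]}))
#    length  num_words
# 0      11          2
# 1       3          1
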
class FeatureGeneratorNAMarker(RuleBasedFeatureGenerator):
    """
    Creates features indicating whether another feature is N/A (not available).
    It can be practical to use this feature generator in conjunction with DFTFillNA for models
    that cannot handle missing values.
    """
    def __init__(self, columns: List[str], value_a=0, value_na=1):
        """
        Note: When changing the default values, use only values that are considered to be normalised when this
        feature generator is used in a context where DFTNormalisation is applied (no normalisation is applied
        to features generated by this feature generator).

        :param columns: the columns for which to generate N/A marker features
        :param value_a: the feature value if the input feature is available
        :param value_na: the feature value if the input feature is not available
        """
        super().__init__(normalisation_rule_template=DFTNormalisation.RuleTemplate(skip=True))
        self.columns = columns
        self.valueA = value_a
        self.valueNA = value_na

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        new_cols = {}
        value_map = {True: self.valueNA, False: self.valueA}
        for col in self.columns:
            new_cols[f"{col}_na"] = [value_map[is_na] for is_na in df[col].isna()]
        return pd.DataFrame(new_cols, index=df.index)

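# Usage sketch (illustrative column names): marking missing values so that a downstream model can
# distinguish them; typically combined with a subsequent fill transformation (e.g. DFTFillNA).
#
# >>> df = pd.DataFrame({"a": [1.0, None], "b": [None, 2.0]})
# >>> FeatureGeneratorNAMarker(["a", "b"]).fit_generate(df)
#    a_na  b_na
# 0     0     1
# 1     1     0
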
def flattened_feature_generator(fgen: FeatureGenerator,
        columns_to_flatten: List[str] = None,
        keep_other_columns=True,
        normalisation_rules: Sequence[DFTNormalisation.Rule] = (),
        normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None):
    """
    Returns a feature generator which generates flattened versions of (some of) the columns generated by the
    given feature generator.

    :param fgen: the feature generator which generates columns that are to be flattened
    :param columns_to_flatten: the list of names of output columns to be flattened; if None, flatten all columns
    :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained
        by the returned feature generator
    :param normalisation_rules: additional normalisation rules for the flattened output columns
    :param normalisation_rule_template: may be supplied instead of normalisation_rules for the case where a single
        rule shall apply to all flattened output columns
    :return: a FeatureGenerator instance that generates flattened versions of the specified columns and leaves
        all other output columns as is

    Example:

    >>> from sensai.featuregen import FeatureGeneratorTakeColumns, flattened_feature_generator
    >>> import pandas as pd
    >>>
    >>> df = pd.DataFrame({"foo": [[1, 2], [3, 4]], "bar": ["a", "b"]})
    >>> fgen = flattened_feature_generator(FeatureGeneratorTakeColumns(), columns_to_flatten=["foo"])
    >>> fgen.generate(df)
       foo_0  foo_1 bar
    0      1      2   a
    1      3      4   b
    """
    flattening_generator = FeatureGeneratorFlattenColumns(columns=columns_to_flatten,
        normalisation_rules=normalisation_rules, normalisation_rule_template=normalisation_rule_template)
    if columns_to_flatten is None or not keep_other_columns:
        return ChainedFeatureGenerator(fgen, flattening_generator)
    else:
        return ChainedFeatureGenerator(fgen,
            MultiFeatureGenerator(flattening_generator, FeatureGeneratorTakeColumns(except_columns=columns_to_flatten)))

class FeatureGeneratorFromDFT(FeatureGenerator):
    def __init__(self,
            dft: DataFrameTransformer,
            categorical_feature_names: Optional[Union[Sequence[str], str]] = None,
            normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (),
            normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None,
            add_categorical_default_rules=True):
        super().__init__(categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules,
            normalisation_rule_template=normalisation_rule_template,
            add_categorical_default_rules=add_categorical_default_rules)
        self.dft = dft

    def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
        self.dft.fit(x)

    def _generate(self, df: pd.DataFrame, ctx=None) -> pd.DataFrame:
        return self.dft.apply(df)