import copy
import logging
import re
from abc import ABC, abstractmethod
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Sequence, Set, Union
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder
from .sklearn_transformer import SkLearnTransformerProtocol
from ..util import flatten_arguments, count_not_none
from ..util.pandas import DataFrameColumnChangeTracker
from ..util.pickle import setstate
from ..util.string import or_regex_group, ToStringMixin
from ..util.version import Version
if TYPE_CHECKING:
from ..featuregen import FeatureGenerator
from ..columngen import ColumnGenerator
log = logging.getLogger(__name__)

class DFTFromFeatureGenerator(DataFrameTransformer):
def _fit(self, df: pd.DataFrame):
self.fgen.fit(df, ctx=None)
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return self.fgen.generate(df)
def __init__(self, fgen: "FeatureGenerator"):
super().__init__()
self.fgen = fgen
self.set_name(f"{self.__class__.__name__}[{self.fgen.get_name()}]")

class DFTRenameColumns(RuleBasedDataFrameTransformer):
def __init__(self, columns_map: Dict[str, str]):
"""
:param columns_map: dictionary mapping old column names to new names
"""
super().__init__()
self.columnsMap = columns_map
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df.rename(columns=self.columnsMap)

class DFTConditionalRowFilterOnColumn(RuleBasedDataFrameTransformer):
"""
Filters a data frame by applying a boolean function to one of the columns and retaining only the rows
for which the function returns True
"""
def __init__(self, column: str, condition: Callable[[Any], bool]):
super().__init__()
self.column = column
self.condition = condition
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[df[self.column].apply(self.condition)]

class DFTInSetComparisonRowFilterOnColumn(RuleBasedDataFrameTransformer):
"""
Filters a data frame on the selected column and retains only the rows for which the value is in the setToKeep
"""
def __init__(self, column: str, set_to_keep: Set):
super().__init__()
self.setToKeep = set_to_keep
self.column = column
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[df[self.column].isin(self.setToKeep)]
    def info(self):
info = super().info()
info["column"] = self.column
info["setToKeep"] = self.setToKeep
return info

class DFTNotInSetComparisonRowFilterOnColumn(RuleBasedDataFrameTransformer):
"""
Filters a data frame on the selected column and retains only the rows for which the value is not in the setToDrop
"""
def __init__(self, column: str, set_to_drop: Set):
super().__init__()
self.setToDrop = set_to_drop
self.column = column
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[~df[self.column].isin(self.setToDrop)]
    def info(self):
info = super().info()
info["column"] = self.column
info["setToDrop"] = self.setToDrop
return info

class DFTVectorizedConditionalRowFilterOnColumn(RuleBasedDataFrameTransformer):
"""
Filters a data frame by applying a vectorized condition on the selected column and retaining only the rows
for which it returns True
"""
def __init__(self, column: str, vectorized_condition: Callable[[pd.Series], Sequence[bool]]):
super().__init__()
self.column = column
self.vectorizedCondition = vectorized_condition
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[self.vectorizedCondition(df[self.column])]
    def info(self):
info = super().info()
info["column"] = self.column
return info

class DFTRowFilter(RuleBasedDataFrameTransformer):
"""
Filters a data frame by applying a condition function to each row and retaining only the rows
for which it returns True
"""
def __init__(self, condition: Callable[[Any], bool]):
super().__init__()
self.condition = condition
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[df.apply(self.condition, axis=1)]
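
# Illustrative usage sketch for the row filter transformers above (not part of the
# module; the data frame and values are made up, and we assume the fit_apply
# convenience method provided by the DataFrameTransformer base class):
#
#     df = pd.DataFrame({"city": ["a", "b", "c"], "population": [100, 2000, 300]})
#     DFTConditionalRowFilterOnColumn("population", lambda p: p > 200).fit_apply(df)  # rows b, c
#     DFTInSetComparisonRowFilterOnColumn("city", {"a", "b"}).fit_apply(df)  # rows a, b
#     DFTVectorizedConditionalRowFilterOnColumn("population", lambda s: s > 200).fit_apply(df)  # rows b, c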

class DFTModifyColumn(RuleBasedDataFrameTransformer):
"""
Modifies a column specified by 'column' using 'columnTransform'
"""
def __init__(self, column: str, column_transform: Union[Callable, np.ufunc]):
"""
:param column: the name of the column to be modified
:param column_transform: a function operating on single cells or a Numpy ufunc that applies to an entire Series
"""
super().__init__()
self.column = column
self.columnTransform = column_transform
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
df[self.column] = df[self.column].apply(self.columnTransform)
return df

class DFTModifyColumnVectorized(RuleBasedDataFrameTransformer):
"""
Modifies a column specified by 'column' using 'columnTransform'. This transformer can be used to utilise Numpy vectorisation for
performance optimisation.
"""
def __init__(self, column: str, column_transform: Callable[[np.ndarray], Union[Sequence, pd.Series, np.ndarray]]):
"""
:param column: the name of the column to be modified
        :param column_transform: a function which takes the column's values as a Numpy array and returns the sequence,
            Series or array of values that shall be assigned to the column as a whole
"""
super().__init__()
self.column = column
self.columnTransform = column_transform
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
df[self.column] = self.columnTransform(df[self.column].values)
return df
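
# Illustrative sketch contrasting the two column modification transformers above
# (not part of the module; data is made up). DFTModifyColumn applies the function
# to each cell via Series.apply, whereas DFTModifyColumnVectorized applies it once
# to the column's underlying NumPy array, which is typically faster:
#
#     df = pd.DataFrame({"x": [1.0, 4.0, 9.0]})
#     DFTModifyColumn("x", lambda v: v ** 0.5).fit_apply(df)  # per-cell
#     DFTModifyColumnVectorized("x", np.sqrt).fit_apply(df)  # vectorised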

class DFTOneHotEncoder(DataFrameTransformer):
def __init__(self, columns: Optional[Union[str, Sequence[str]]],
categories: Union[List[np.ndarray], Dict[str, np.ndarray]] = None, inplace=False, ignore_unknown=False,
array_valued_result=False):
"""
        One-hot-encodes categorical variables.

        :param columns: list of column names or a regex matching the names of the columns, each of which is to be replaced
            by a list of one-hot-encoded columns (or by a single array-valued column if array_valued_result=True);
            if None, no columns are one-hot-encoded
        :param categories: numpy arrays containing the possible values of each of the specified columns (for the case where a
            sequence is specified in 'columns') or a dictionary mapping column names to arrays of possible categories.
            If None, the possible values will be inferred from the data during fitting
        :param inplace: whether to perform the transformation in-place
        :param ignore_unknown: if True and an unknown category is encountered during transform, the resulting one-hot
            encoded columns for this feature will be all zeros; if False, an unknown category will raise an error
:param array_valued_result: whether to replace the input columns by columns of the same name containing arrays as values
instead of creating a separate column per original value
"""
super().__init__()
self._paramInfo["columns"] = columns
self._paramInfo["inferCategories"] = categories is None
self.oneHotEncoders = None
if columns is None:
self._columnsToEncode = []
self._columnNameRegex = "$"
        elif isinstance(columns, str):
self._columnNameRegex = columns
self._columnsToEncode = None
else:
self._columnNameRegex = or_regex_group(columns)
self._columnsToEncode = columns
self.inplace = inplace
self.arrayValuedResult = array_valued_result
self.handleUnknown = "ignore" if ignore_unknown else "error"
if categories is not None:
            if isinstance(categories, dict):
                self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(cats)], handle_unknown=self.handleUnknown,
                    **self._sparse_kwargs()) for col, cats in categories.items()}
else:
                if len(columns) != len(categories):
                    raise ValueError("Given categories must have the same length as columns to process")
                self.oneHotEncoders = {col: OneHotEncoder(categories=[np.sort(cats)], handle_unknown=self.handleUnknown,
                    **self._sparse_kwargs()) for col, cats in zip(columns, categories)}
@staticmethod
def _sparse_kwargs(sparse=False):
if Version(sklearn).is_at_least(1, 2):
return dict(sparse_output=sparse)
else:
return dict(sparse=sparse)
def __setstate__(self, state):
if "arrayValuedResult" not in state:
state["arrayValuedResult"] = False
super().__setstate__(state)
def _tostring_additional_entries(self) -> Dict[str, Any]:
d = super()._tostring_additional_entries()
d["columns"] = self._paramInfo.get("columns")
return d
def _fit(self, df: pd.DataFrame):
if self._columnsToEncode is None:
self._columnsToEncode = [c for c in df.columns if re.fullmatch(self._columnNameRegex, c) is not None]
if len(self._columnsToEncode) == 0:
log.warning(f"{self} does not apply to any columns, transformer has no effect; regex='{self._columnNameRegex}'")
if self.oneHotEncoders is None:
self.oneHotEncoders = {}
sparse_kwargs = self._sparse_kwargs()
for column in self._columnsToEncode:
values = df[column].dropna().unique()
categories = [np.sort(values)]
self.oneHotEncoders[column] = OneHotEncoder(categories=categories, handle_unknown=self.handleUnknown,
**sparse_kwargs)
for columnName in self._columnsToEncode:
self.oneHotEncoders[columnName].fit(df[[columnName]])
def _apply(self, df: pd.DataFrame):
if len(self._columnsToEncode) == 0:
return df
if not self.inplace:
df = df.copy()
for columnName in self._columnsToEncode:
encoded_array = self.oneHotEncoders[columnName].transform(df[[columnName]])
if not self.arrayValuedResult:
df = df.drop(columns=columnName)
for i in range(encoded_array.shape[1]):
                    df[f"{columnName}_{i}"] = encoded_array[:, i]
else:
df[columnName] = list(encoded_array)
return df
    def info(self):
info = super().info()
info["inplace"] = self.inplace
info["handleUnknown"] = self.handleUnknown
info["arrayValuedResult"] = self.arrayValuedResult
info.update(self._paramInfo)
return info
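
# Illustrative usage sketch for DFTOneHotEncoder (not part of the module; column
# names and data are made up). Fitting infers the categories from the data unless
# they are passed explicitly; with ignore_unknown=True, categories unseen during
# fitting yield all-zero indicator columns at application time:
#
#     df = pd.DataFrame({"color": ["red", "green", "red"], "size": [1, 2, 3]})
#     enc = DFTOneHotEncoder("color", ignore_unknown=True)
#     df_enc = enc.fit_apply(df)
#     # 'color' is replaced by indicator columns color_0, color_1 (categories in sorted order)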

class DFTColumnFilter(RuleBasedDataFrameTransformer):
"""
A DataFrame transformer that filters columns by retaining or dropping specified columns
"""
def __init__(self, keep: Union[str, Sequence[str]] = None, drop: Union[str, Sequence[str]] = None):
super().__init__()
        self.keep = [keep] if isinstance(keep, str) else keep
self.drop = drop
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
if self.keep is not None:
df = df[self.keep]
if self.drop is not None:
df = df.drop(columns=self.drop)
return df
    def info(self):
info = super().info()
info["keep"] = self.keep
info["drop"] = self.drop
return info

class DFTKeepColumns(DFTColumnFilter):
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[self.keep]

class DFTDRowFilterOnIndex(RuleBasedDataFrameTransformer):
def __init__(self, keep: Set = None, drop: Set = None):
super().__init__()
self.drop = drop
self.keep = keep
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
        if self.keep is not None:
            # convert to a list, as newer pandas versions do not support set indexers in loc
            df = df.loc[list(self.keep)]
        if self.drop is not None:
            df = df.drop(list(self.drop))
return df
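
# Illustrative usage sketch for DFTDRowFilterOnIndex (not part of the module; data
# is made up): rows are retained or dropped based on their index values.
#
#     df = pd.DataFrame({"x": [10, 20, 30]}, index=["a", "b", "c"])
#     DFTDRowFilterOnIndex(keep={"a", "c"}).fit_apply(df)  # rows a, c
#     DFTDRowFilterOnIndex(drop={"b"}).fit_apply(df)  # rows a, c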

class DFTNormalisation(DataFrameTransformer):
"""
Applies normalisation/scaling to a data frame by applying a set of transformation rules, where each
rule defines a set of columns to which it applies (learning a single transformer based on the values
of all applicable columns).
DFTNormalisation ignores N/A values during fitting and application.
"""

    class RuleTemplate:
def __init__(self,
skip: bool = False,
unsupported: bool = False,
transformer: Optional[SkLearnTransformerProtocol] = None,
transformer_factory: Callable[[], SkLearnTransformerProtocol] = None,
independent_columns: Optional[bool] = None,
array_valued: bool = False,
fit: bool = True):
"""
A template from which a rule which matches multiple columns can be created.
This is useful for the generation of rules which shall apply to all the (numerical) columns generated
by a :class:`FeatureGenerator` without specifically naming them.
Use the parameters as follows:
* If the relevant features are already normalised, pass ``skip=True``
* If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True``
* If the relevant features shall be normalised, the other parameters apply.
            Passing no parameters at all, i.e. ``RuleTemplate()``, is an option if ...

            * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application
              is suitable for the relevant set of features.
              Otherwise, specify either ``transformer_factory`` or ``transformer``.
            * the resulting rule will match only a single column. Otherwise, ``independent_columns``
              must be set to either True or False.

:param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already
normalised).
:param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if
attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making
their way into the final dataframe that will be normalised and used for training a model.
:param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s)
for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either
transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is
fitted.
NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the
same transformer instance to be used in multiple places - e.g. sharing it across several feature generators or models that
use the same type of column with associated rule/rule template (disabling `fit` where appropriate).
:param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if
`transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default
factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options.
:param array_valued: whether the column values are not scalars but arrays (of some fixed but arbitrary length).
It is assumed that all entries in such arrays are to be normalised in the same way, i.e. the same
transformation will be applied to each entry in the array.
Only a single matching column is supported for array_valued=True, i.e. the rule must apply to at most one column.
:param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is
if a transformer instance is provided (instead of a factory), which is already fitted.
:param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a
separate transformation is to be learned for each of them (rather than using the same transformation for all columns and
learning the transformation from the data of all columns).
                This parameter must be specified for rules matching more than one column;
                None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect.
"""
# NOTE: keep in sync with Rule!
if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0:
raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True")
self.skip = skip
self.unsupported = unsupported
self.transformer = transformer
self.transformerFactory = transformer_factory
self.independentColumns = independent_columns
self.arrayValued = array_valued
self.fit = fit
def __setstate__(self, state):
setstate(DFTNormalisation.RuleTemplate, self, state, new_default_properties=dict(arrayValued=False, fit=True))
        def to_rule(self, regex: Optional[Union[str, re.Pattern]]):
"""
Convert the template to a rule for all columns matching the regex
            :param regex: a regular expression defining the column(s) the rule applies to
:return: the resulting Rule
"""
return DFTNormalisation.Rule(regex, skip=self.skip, unsupported=self.unsupported, transformer=self.transformer,
transformer_factory=self.transformerFactory, independent_columns=self.independentColumns, array_valued=self.arrayValued, fit=self.fit)
        def to_placeholder_rule(self):
return self.to_rule(None)

    class Rule(ToStringMixin):
def __init__(self,
regex: Optional[Union[str, re.Pattern]],
skip: bool = False,
unsupported: bool = False,
transformer: Optional[SkLearnTransformerProtocol] = None,
transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None,
array_valued: bool = False,
fit: bool = True,
independent_columns: Optional[bool] = None):
"""
Use the parameters as follows:
* If the relevant features are already normalised, pass ``skip=True``
* If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True``
* If the relevant features shall be normalised, the other parameters apply.
            Passing no parameters other than ``regex``, i.e. ``Rule(regex)``, is an option if ...

            * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application
              is suitable for the relevant set of features.
              Otherwise, specify either ``transformer_factory`` or ``transformer``.
            * the resulting rule will match only a single column. Otherwise, ``independent_columns``
              must be set to either True or False.

:param regex: a regular expression defining the column(s) the rule applies to.
If it matches multiple columns, these columns will be normalised in the same way (using the same normalisation
process for each column) unless independent_columns=True.
                If None, the rule is a placeholder rule and the regex must be set later via set_regex before the rule can be applied.
:param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already
normalised).
:param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if
attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making
their way into the final dataframe that will be normalised and used for training a model.
:param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s)
for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either
transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is
fitted.
NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the
same transformer instance to be used in multiple places - e.g. sharing it across several feature generators or models that
use the same type of column with associated rule/rule template (disabling `fit` where appropriate).
:param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if
`transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default
factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options.
:param array_valued: whether the column values are not scalars but arrays (of some fixed but arbitrary length).
It is assumed that all entries in such arrays are to be normalised in the same way, i.e. the same
transformation will be applied to each entry in the array.
Only a single matching column is supported for array_valued=True, i.e. the regex must match at most one column.
:param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is
if a transformer instance is provided (instead of a factory), which is already fitted.
:param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a
separate transformation is to be learned for each of them (rather than using the same transformation for all columns and
learning the transformation from the data of all columns).
                This parameter must be specified for rules matching more than one column;
                None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect.
"""
if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0:
raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True")
if isinstance(regex, str):
regex = re.compile(regex)
self.regex = regex
# NOTE: keep in sync with RuleTemplate!
self.skip = skip
self.unsupported = unsupported
self.transformer = transformer
self.transformerFactory = transformer_factory
self.arrayValued = array_valued
self.fit = fit
self.independentColumns = independent_columns
def __setstate__(self, state):
setstate(DFTNormalisation.Rule, self, state, new_default_properties=dict(arrayValued=False, fit=True, independentColumns=False,
transformerFactory=None))
def _tostring_excludes(self) -> List[str]:
return super()._tostring_excludes() + ["regex"]
def _tostring_additional_entries(self) -> Dict[str, Any]:
d = super()._tostring_additional_entries()
if self.regex is not None:
d["regex"] = f"'{self.regex.pattern}'"
return d
        def set_regex(self, regex: str):
try:
self.regex = re.compile(regex)
except Exception as e:
                raise Exception(f"Could not compile regex '{regex}': {e}") from e
        def matches(self, column: str):
if self.regex is None:
raise Exception("Attempted to apply a placeholder rule. Perhaps the feature generator from which the rule originated was "
"never applied in order to have the rule instantiated.")
return self.regex.fullmatch(column) is not None
        def matching_columns(self, columns: Sequence[str]) -> List[str]:
return [col for col in columns if self.matches(col)]
def __init__(self, rules: Sequence[Rule], default_transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None,
require_all_handled: bool = True, inplace: bool = False):
"""
:param rules: the set of rules; rules (i.e., their transformers) are always fitted and applied in the given order.
A convenient way to obtain a set of rules in the :class:`sensai.vector_model.VectorModel` context is from a
:class:`sensai.featuregen.FeatureCollector` or :class:`sensai.featuregen.MultiFeatureGenerator`.
Generally, it is often a good idea to associate rules (or a rule template) with a feature generator.
Then the rules can be obtained from it using `get_normalisation_rules`.
:param default_transformer_factory: a factory for the creation of transformer instances (which implements the
API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
rules that do not specify a particular transformer.
The default transformer will only be applied to columns matched by such rules, unmatched columns will
not be transformed.
Use :class:`SkLearnTransformerFactoryFactory` to conveniently create a factory.
:param require_all_handled: whether to raise an exception if any column is not matched by a rule
:param inplace: whether to apply data frame transformations in-place
"""
super().__init__()
self.requireAllHandled = require_all_handled
self.inplace = inplace
self._userRules = rules
self._defaultTransformerFactory = default_transformer_factory
self._rules = None
def _tostring_additional_entries(self) -> Dict[str, Any]:
d = super()._tostring_additional_entries()
if self._rules is not None:
d["rules"] = self._rules
else:
d["userRules"] = self._userRules
return d
def _fit(self, df: pd.DataFrame):
matched_rules_by_column = {}
self._rules = []
        # For rules matching multiple columns, if independent_columns is False, the columns
        # will be concatenated and treated as a single column for fitting the transformer.
        # Since the transformers follow the sklearn interface, fitting them on a 2D array learns a
        # separate transformation per column; this is what happens for independent_columns=True.
for rule in self._userRules:
matching_columns = rule.matching_columns(df.columns)
for c in matching_columns:
if c in matched_rules_by_column:
raise Exception(f"More than one rule applies to column '{c}': {matched_rules_by_column[c]}, {rule}")
matched_rules_by_column[c] = rule
if len(matching_columns) > 0:
if rule.unsupported:
raise Exception(f"Normalisation of columns {matching_columns} is unsupported according to {rule}. "
f"If you want to make use of these columns, transform them into a supported column before applying "
f"{self.__class__.__name__}.")
if not rule.skip:
if rule.transformer is None:
if rule.transformerFactory is not None:
rule.transformer = rule.transformerFactory()
else:
if self._defaultTransformerFactory is None:
raise Exception(f"No transformer to fit: {rule} defines no transformer and instance has no transformer "
f"factory")
rule.transformer = self._defaultTransformerFactory()
if rule.fit:
# fit transformer
applicable_df = df[sorted(matching_columns)]
if rule.arrayValued:
if len(matching_columns) > 1:
raise Exception(f"Array-valued case is only supported for a single column, "
f"matched {matching_columns} for {rule}")
values = np.concatenate(applicable_df.values.flatten())
values = values.reshape((len(values), 1))
elif rule.independentColumns:
values = applicable_df.values
else:
values = applicable_df.values.flatten()
values = values.reshape((len(values), 1))
rule.transformer.fit(values)
else:
log.log(logging.DEBUG - 1, f"{rule} matched no columns")
# collect specialised rule for application
specialised_rule = copy.copy(rule)
if not specialised_rule.skip and specialised_rule.independentColumns is None and len(matching_columns) > 1:
raise ValueError(f"Normalisation rule matching multiple columns {matching_columns} must set `independentColumns` "
f"(got None)")
specialised_rule.set_regex(or_regex_group(matching_columns))
self._rules.append(specialised_rule)
def _check_unhandled_columns(self, df, matched_rules_by_column):
if self.requireAllHandled:
unhandled_columns = set(df.columns) - set(matched_rules_by_column.keys())
if len(unhandled_columns) > 0:
raise Exception(f"The following columns are not handled by any rules: {unhandled_columns}; "
f"rules: {', '.join(map(str, self._rules))}")
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
if not self.inplace:
df = df.copy()
matched_rules_by_column = {}
for rule in self._rules:
matching_columns = rule.matching_columns(df.columns)
if len(matching_columns) == 0:
continue
for c in matching_columns:
matched_rules_by_column[c] = rule
if not rule.skip:
if rule.independentColumns and not rule.arrayValued:
matching_columns = sorted(matching_columns)
df[matching_columns] = rule.transformer.transform(df[matching_columns].values)
else:
for c in matching_columns:
if not rule.arrayValued:
df[c] = rule.transformer.transform(df[[c]].values)
else:
df[c] = [rule.transformer.transform(np.array([x]).T)[:, 0] for x in df[c]]
self._check_unhandled_columns(df, matched_rules_by_column)
return df
    def info(self):
info = super().info()
info["requireAllHandled"] = self.requireAllHandled
info["inplace"] = self.inplace
return info
    def find_rule(self, col_name: str) -> Optional["DFTNormalisation.Rule"]:
for rule in self._rules:
if rule.matches(col_name):
return rule
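
# Illustrative usage sketch for DFTNormalisation (not part of the module; the
# column names and the choice of StandardScaler as the default transformer
# factory are made up for the example):
#
#     from sklearn.preprocessing import StandardScaler
#
#     rules = [
#         DFTNormalisation.Rule(r"already_scaled_feature", skip=True),
#         DFTNormalisation.Rule(r"sensor_\d+", independent_columns=True),
#     ]
#     dft = DFTNormalisation(rules, default_transformer_factory=StandardScaler)
#     df_normalised = dft.fit_apply(df)
#     # with require_all_handled=True (the default), every column of df must be matched by some rule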

class DFTFromColumnGenerators(RuleBasedDataFrameTransformer):
"""
Extends a data frame with columns generated from ColumnGenerator instances
"""
def __init__(self, column_generators: Sequence['ColumnGenerator'], inplace=False):
super().__init__()
self.columnGenerators = column_generators
self.inplace = inplace
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
if not self.inplace:
df = df.copy()
for cg in self.columnGenerators:
series = cg.generate_column(df)
df[series.name] = series
return df
    def info(self):
info = super().info()
info["inplace"] = self.inplace
return info

class DFTCountEntries(RuleBasedDataFrameTransformer):
"""
Transforms a data frame, based on one of its columns, into a new data frame containing two columns that indicate the counts
of unique values in the input column. It is the "DataFrame output version" of pd.Series.value_counts.
    Each row of the output data frame holds a unique value of the input column and the number of times it appears in the input column.
"""
def __init__(self, column_for_entry_count: str, column_name_for_resulting_counts: str = "counts"):
super().__init__()
self.columnNameForResultingCounts = column_name_for_resulting_counts
self.columnForEntryCount = column_for_entry_count
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
series = df[self.columnForEntryCount].value_counts()
return pd.DataFrame({self.columnForEntryCount: series.index, self.columnNameForResultingCounts: series.values})
    def info(self):
info = super().info()
info["columnNameForResultingCounts"] = self.columnNameForResultingCounts
info["columnForEntryCount"] = self.columnForEntryCount
return info
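
# Illustrative usage sketch for DFTCountEntries (not part of the module; data is
# made up):
#
#     df = pd.DataFrame({"fruit": ["apple", "pear", "apple"]})
#     DFTCountEntries("fruit").fit_apply(df)
#     #    fruit  counts
#     # 0  apple       2
#     # 1   pear       1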

class DFTAggregationOnColumn(RuleBasedDataFrameTransformer):
def __init__(self, column_for_aggregation: str, aggregation: Callable):
super().__init__()
self.columnForAggregation = column_for_aggregation
self.aggregation = aggregation
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df.groupby(self.columnForAggregation).agg(self.aggregation)

class DFTRoundFloats(RuleBasedDataFrameTransformer):
def __init__(self, decimals=0):
super().__init__()
self.decimals = decimals
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return pd.DataFrame(np.round(df.values, self.decimals), columns=df.columns, index=df.index)
    def info(self):
info = super().info()
info["decimals"] = self.decimals
return info

class DFTSortColumns(RuleBasedDataFrameTransformer):
"""
    Sorts a data frame's columns in ascending order of column name
"""
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
return df[sorted(df.columns)]

class DFTFillNA(RuleBasedDataFrameTransformer):
"""
Fills NA/NaN values with the given value
"""
def __init__(self, fill_value, inplace: bool = False):
super().__init__()
self.fillValue = fill_value
self.inplace = inplace
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
if self.inplace:
df.fillna(value=self.fillValue, inplace=True)
return df
else:
return df.fillna(value=self.fillValue)

class DFTCastCategoricalColumns(RuleBasedDataFrameTransformer):
"""
Casts columns with dtype category to the given type.
This can be useful in cases where categorical columns are not accepted by the model but the column values are actually numeric,
in which case the cast to a numeric value yields an acceptable label encoding.
"""
def __init__(self, columns: Optional[List[str]] = None, dtype=float):
"""
:param columns: the columns to convert; if None, convert all that have dtype category
:param dtype: the data type to which categorical columns are to be converted
"""
super().__init__()
self.columns = columns
self.dtype = dtype
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
df = df.copy()
columns = self.columns if self.columns is not None else df.columns
for col in columns:
s = df[col]
if s.dtype.name == "category":
df[col] = s.astype(self.dtype)
return df
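
# Illustrative usage sketch for DFTCastCategoricalColumns (not part of the module;
# data is made up): a numeric column stored with dtype 'category' is cast to float,
# yielding a label encoding that numeric models can consume.
#
#     df = pd.DataFrame({"grade": pd.Categorical([1, 2, 2])})
#     DFTCastCategoricalColumns().fit_apply(df)  # 'grade' becomes float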

class DFTDropNA(RuleBasedDataFrameTransformer):
"""
Drops rows or columns containing NA/NaN values
"""
def __init__(self, axis=0, inplace=False):
"""
:param axis: 0 to drop rows, 1 to drop columns containing an N/A value
:param inplace: whether to perform the operation in-place on the input data frame
"""
super().__init__()
self.axis = axis
self.inplace = inplace
def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
if self.inplace:
df.dropna(axis=self.axis, inplace=True)
return df
else:
return df.dropna(axis=self.axis)