Coverage for src/sensai/featuregen/feature_generator_registry.py: 87%
68 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1import logging
2from typing import Callable, Dict, TYPE_CHECKING, Hashable, Union
4import pandas as pd
6from . import FeatureGenerator, MultiFeatureGenerator
7from ..data_transformation import DFTNormalisation, DFTOneHotEncoder
8from ..util.string import list_string
10if TYPE_CHECKING:
11 pass
13log = logging.getLogger(__name__)
class FeatureGeneratorRegistry:
    """
    Represents a registry for (named) feature generator factories
    """
    def __init__(self, use_singletons: bool = False):
        """
        :param use_singletons: if True, internally maintain feature generator singletons, such that there is at most one
            instance for each name/key
        """
        self._feature_generator_factories: Dict[Hashable, Callable[[], FeatureGenerator]] = {}
        self._feature_generator_singletons: Dict[Hashable, FeatureGenerator] = {}
        self._use_singletons = use_singletons

    @property
    def available_features(self):
        """
        :return: a list of the names/keys for which a factory has been registered
        """
        return list(self._feature_generator_factories)

    @staticmethod
    def _name(name: Hashable):
        """
        Normalises a registry key.

        :param name: the name/key as passed by the caller
        :return: the object's ``name`` attribute if it has one (e.g. for enum items), else the object itself
        """
        # for enums, which have .name, use the name only, because it is less problematic to persist
        if hasattr(name, "name"):
            name = name.name
        return name

    def register_factory(self, name: Hashable, factory: Callable[[], FeatureGenerator]):
        """
        Registers a feature generator factory which can subsequently be referenced by models via their name/hashable key

        :param name: the name/key (which can, in particular, be a string or an Enum item). Especially for larger projects
            the use of an Enum is recommended (for optimal IDE support)
        :param factory: the factory
        :raises ValueError: if a factory has already been registered for the given name
        """
        name = self._name(name)
        if name in self._feature_generator_factories:
            raise ValueError(f"Generator for name '{name}' already registered")
        self._feature_generator_factories[name] = factory

    def get_feature_generator(self, name: Hashable) -> FeatureGenerator:
        """
        Creates a feature generator from a name, which must have been previously registered.
        The name of the returned feature generator is set to name (via its ``set_name`` method).

        :param name: the name (which can, in particular, be a string or an enum item)
        :return: a new feature generator instance (or existing instance for the case where use_singletons is enabled)
        :raises ValueError: if no factory has been registered for the given name
        """
        name = self._name(name)
        # the singleton dict is only ever populated when use_singletons is enabled
        generator = self._feature_generator_singletons.get(name)
        if generator is None:
            factory = self._feature_generator_factories.get(name)
            if factory is None:
                raise ValueError(
                    f"No factory registered for name '{name}': "
                    f"known names: {list_string(self._feature_generator_factories.keys())}. "
                    f"Use register_factory to register a new feature generator factory.")
            generator = factory()
            generator.set_name(name)
            if self._use_singletons:
                self._feature_generator_singletons[name] = generator
        return generator

    def collect_features(self, *feature_generators_or_names: Union[Hashable, FeatureGenerator]) -> "FeatureCollector":
        """
        Creates a feature collector for the given feature names/keys/instances, which can subsequently be added to a model.

        :param feature_generators_or_names: feature names/keys known to this registry or feature generator instances
        :return: the feature collector, which uses this registry to resolve names/keys
        """
        return FeatureCollector(*feature_generators_or_names, registry=self)
class FeatureCollector:
    """
    A feature collector which facilitates the collection of features that shall be used by a model as well as the
    generation of commonly used feature transformers that are informed by the features' meta-data.
    """

    def __init__(self,
            *feature_generators_or_names: Union[Hashable, FeatureGenerator],
            registry: Union[FeatureGeneratorRegistry, None] = None):
        """
        :param feature_generators_or_names: generator names/keys (known to the registry) or generator instances
        :param registry: the feature generator registry for the case where names/keys are passed
        :raises ValueError: if a name/key is passed but no registry is given
        """
        self._feature_generators_or_names = feature_generators_or_names
        self._registry = registry
        # created eagerly, such that invalid names/keys are reported at construction time
        self._multi_feature_generator = self.create_multi_feature_generator()

    def get_multi_feature_generator(self) -> MultiFeatureGenerator:
        """
        Gets the multi-feature generator that was created for this collector.
        To create a new, independent instance (e.g. when using this collector for multiple
        models), use :meth:`create_multi_feature_generator` instead.

        :return: the multi-feature generator that was created for this instance
        """
        return self._multi_feature_generator

    def get_normalisation_rules(self, include_generated_categorical_rules=True):
        """
        Obtains the normalisation rules from this collector's multi-feature generator.

        :param include_generated_categorical_rules: passed on unchanged to the multi-feature generator's
            ``get_normalisation_rules`` method
        :return: the result of the multi-feature generator's ``get_normalisation_rules`` method
        """
        return self.get_multi_feature_generator().get_normalisation_rules(
            include_generated_categorical_rules=include_generated_categorical_rules)

    def get_categorical_feature_name_regex(self) -> str:
        """
        :return: a regular expression that matches all known categorical feature names
        """
        return self.get_multi_feature_generator().get_categorical_feature_name_regex()

    def create_multi_feature_generator(self) -> MultiFeatureGenerator:
        """
        Creates a new instance of the multi-feature generator that generates the features
        collected by this instance. If the feature collector instance is not used for
        multiple models, use :meth:`get_multi_feature_generator` instead to obtain
        the instance that has already been created.

        :return: a new multi-feature generator that generates the collected features
        :raises ValueError: if a name/key was passed but no registry is available to resolve it
        """
        feature_generators = []
        for f in self._feature_generators_or_names:
            if isinstance(f, FeatureGenerator):
                # instances are used as given
                feature_generators.append(f)
            else:
                # anything else is treated as a name/key that the registry must resolve
                if self._registry is None:
                    raise ValueError(f"Received feature name '{f}' instead of instance but no registry to perform the lookup")
                feature_generators.append(self._registry.get_feature_generator(f))
        return MultiFeatureGenerator(*feature_generators)

    def create_dft_normalisation(self, default_transformer_factory=None, require_all_handled=True, inplace=False) -> DFTNormalisation:
        """
        Creates a feature transformer that will apply normalisation to all supported (numeric) features

        :param default_transformer_factory: a factory for the creation of transformer instances (which implements the
            API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
            rules that do not specify a particular transformer.
            The default transformer will only be applied to columns matched by such rules, unmatched columns will
            not be transformed.
            Use SkLearnTransformerFactoryFactory to conveniently create a factory.
        :param require_all_handled: whether to raise an exception if not all columns are matched by a rule
        :param inplace: whether to apply data frame transformations in-place
        :return: the transformer
        """
        return DFTNormalisation(self.get_normalisation_rules(), default_transformer_factory=default_transformer_factory,
            require_all_handled=require_all_handled, inplace=inplace)

    def create_dft_one_hot_encoder(self, ignore_unknown=False, inplace=False) -> DFTOneHotEncoder:
        """
        Creates a feature transformer that will apply one-hot encoding to all the features that are known to be categorical

        :param inplace: whether to perform the transformation in-place
        :param ignore_unknown: if True and an unknown category is encountered during transform, the resulting one-hot
            encoded columns for this feature will be all zeros. if False, an unknown category will raise an error.
        :return: the transformer
        """
        return DFTOneHotEncoder(self.get_categorical_feature_name_regex(), ignore_unknown=ignore_unknown, inplace=inplace)

    def create_feature_transformer_normalisation(self, default_transformer_factory=None, require_all_handled=True, inplace=False) \
            -> DFTNormalisation:
        """
        Creates a feature transformer that will apply normalisation to all supported (numeric) features.
        Alias of create_dft_normalisation.

        :param default_transformer_factory: a factory for the creation of transformer instances (which implements the
            API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
            rules that do not specify a particular transformer.
            The default transformer will only be applied to columns matched by such rules, unmatched columns will
            not be transformed.
            Use SkLearnTransformerFactoryFactory to conveniently create a factory.
        :param require_all_handled: whether to raise an exception if not all columns are matched by a rule
        :param inplace: whether to apply data frame transformations in-place
        :return: the transformer
        """
        return self.create_dft_normalisation(default_transformer_factory=default_transformer_factory,
            require_all_handled=require_all_handled, inplace=inplace)

    def create_feature_transformer_one_hot_encoder(self, ignore_unknown=False, inplace=False) -> DFTOneHotEncoder:
        """
        Creates a feature transformer that will apply one-hot encoding to all the features that are known to be categorical.
        Alias of create_dft_one_hot_encoder.

        :param inplace: whether to perform the transformation in-place
        :param ignore_unknown: if True and an unknown category is encountered during transform, the resulting one-hot
            encoded columns for this feature will be all zeros. if False, an unknown category will raise an error.
        :return: the transformer
        """
        return self.create_dft_one_hot_encoder(ignore_unknown=ignore_unknown, inplace=inplace)