Coverage for src/sensai/featuregen/feature_generator_registry.py: 87%

68 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import logging 

2from typing import Callable, Dict, TYPE_CHECKING, Hashable, Union 

3 

4import pandas as pd 

5 

6from . import FeatureGenerator, MultiFeatureGenerator 

7from ..data_transformation import DFTNormalisation, DFTOneHotEncoder 

8from ..util.string import list_string 

9 

# Placeholder guard: the body is empty, i.e. no type-checking-only imports are currently needed.
10if TYPE_CHECKING: 

11 pass 

12 

# Module-level logger, named after this module via __name__.
13log = logging.getLogger(__name__) 

14 

15 

class FeatureGeneratorRegistry:
    """
    Represents a registry for (named) feature generator factories.

    Factories are stored under a normalised key (see :meth:`_name`). If ``use_singletons`` is enabled,
    the generator created for a key is cached and returned again on subsequent lookups.
    """
    def __init__(self, use_singletons: bool = False):
        """
        :param use_singletons: if True, internally maintain feature generator singletons, such that there is at most one
            instance for each name/key
        """
        self._feature_generator_factories: Dict[Hashable, Callable[[], FeatureGenerator]] = {}
        self._feature_generator_singletons: Dict[Hashable, FeatureGenerator] = {}
        self._use_singletons = use_singletons

    @property
    def available_features(self):
        """
        :return: the list of (normalised) names/keys for which a factory has been registered
        """
        return list(self._feature_generator_factories.keys())

    @staticmethod
    def _name(name: Hashable):
        """
        Normalises a name/key for use as a dictionary key in this registry.

        :param name: the name/key
        :return: the value of the ``name`` attribute if the given object has one (as is the case for enum items),
            otherwise the object itself
        """
        # for enums, which have .name, use the name only, because it is less problematic to persist
        if hasattr(name, "name"):
            name = name.name
        return name

    def register_factory(self, name: Hashable, factory: Callable[[], FeatureGenerator]):
        """
        Registers a feature generator factory which can subsequently be referenced by models via their name/hashable key

        :param name: the name/key (which can, in particular, be a string or an Enum item). Especially for larger projects
            the use of an Enum is recommended (for optimal IDE support)
        :param factory: the factory
        :raises ValueError: if a factory has already been registered for the given name
        """
        name = self._name(name)
        if name in self._feature_generator_factories:
            raise ValueError(f"Generator for name '{name}' already registered")
        self._feature_generator_factories[name] = factory

    def get_feature_generator(self, name: Hashable) -> FeatureGenerator:
        """
        Creates a feature generator from a name, which must have been previously registered.
        The name of a newly created feature generator is set to the (normalised) name via ``set_name``.

        :param name: the name (which can, in particular, be a string or an enum item)
        :return: a new feature generator instance (or existing instance for the case where use_singletons is enabled)
        :raises ValueError: if no factory has been registered for the given name
        """
        name = self._name(name)
        # the singleton cache is only ever populated when use_singletons is enabled
        generator = self._feature_generator_singletons.get(name)
        if generator is not None:
            return generator

        factory = self._feature_generator_factories.get(name)
        if factory is None:
            raise ValueError(f"No factory registered for name '{name}': known names: "
                f"{list_string(self._feature_generator_factories.keys())}. "
                f"Use register_factory to register a new feature generator factory.")
        generator = factory()
        generator.set_name(name)
        if self._use_singletons:
            self._feature_generator_singletons[name] = generator
        return generator

    def collect_features(self, *feature_generators_or_names: Union[Hashable, FeatureGenerator]) -> "FeatureCollector":
        """
        Creates a feature collector for the given feature names/keys/instances, which can subsequently be added to a model.

        :param feature_generators_or_names: feature names/keys known to this registry or feature generator instances
        :return: a feature collector which uses this registry for the lookup of names/keys
        """
        return FeatureCollector(*feature_generators_or_names, registry=self)

80 

81 

class FeatureCollector(object):
    """
    A feature collector which facilitates the collection of features that shall be used by a model as well as the
    generation of commonly used feature transformers that are informed by the features' meta-data.
    """

    def __init__(self,
            *feature_generators_or_names: Union[Hashable, FeatureGenerator],
            registry: Union[FeatureGeneratorRegistry, None] = None):
        """
        :param feature_generators_or_names: generator names/keys (known to the registry) or generator instances
        :param registry: the feature generator registry for the case where names/keys are passed
        :raises ValueError: if a name/key is passed but no registry was given
        """
        self._feature_generators_or_names = feature_generators_or_names
        self._registry = registry
        # created eagerly, such that invalid names/keys are reported upon construction
        self._multi_feature_generator = self.create_multi_feature_generator()

    def get_multi_feature_generator(self) -> MultiFeatureGenerator:
        """
        Gets the multi-feature generator that was created for this collector.
        To create a new, independent instance (e.g. when using this collector for multiple
        models), use :meth:`create_multi_feature_generator` instead.

        :return: the multi-feature generator that was created for this instance
        """
        return self._multi_feature_generator

    def get_normalisation_rules(self, include_generated_categorical_rules=True):
        """
        Gets the normalisation rules of this collector's multi-feature generator.

        :param include_generated_categorical_rules: passed on unchanged to the multi-feature generator's
            ``get_normalisation_rules`` method
        :return: the result of the multi-feature generator's ``get_normalisation_rules`` method
        """
        return self.get_multi_feature_generator().get_normalisation_rules(
            include_generated_categorical_rules=include_generated_categorical_rules)

    def get_categorical_feature_name_regex(self) -> str:
        """
        :return: a regular expression that matches all known categorical feature names
        """
        return self.get_multi_feature_generator().get_categorical_feature_name_regex()

    def create_multi_feature_generator(self) -> MultiFeatureGenerator:
        """
        Creates a new instance of the multi-feature generator that generates the features
        collected by this instance. If the feature collector instance is not used for
        multiple models, use :meth:`get_multi_feature_generator` instead to obtain
        the instance that has already been created.

        Generator instances that were passed directly are used as they are; names/keys are
        resolved via the registry.

        :return: a new multi-feature generator that generates the collected features
        :raises ValueError: if a name/key is encountered but no registry is available for the lookup
        """
        feature_generators = []
        for f in self._feature_generators_or_names:
            if isinstance(f, FeatureGenerator):
                feature_generators.append(f)
                continue
            # anything that is not a generator instance is treated as a name/key
            if self._registry is None:
                raise ValueError(f"Received feature name '{f}' instead of instance but no registry to perform the lookup")
            feature_generators.append(self._registry.get_feature_generator(f))
        return MultiFeatureGenerator(*feature_generators)

    def create_dft_normalisation(self, default_transformer_factory=None, require_all_handled=True, inplace=False) -> DFTNormalisation:
        """
        Creates a feature transformer that will apply normalisation to all supported (numeric) features

        :param default_transformer_factory: a factory for the creation of transformer instances (which implements the
            API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
            rules that do not specify a particular transformer.
            The default transformer will only be applied to columns matched by such rules, unmatched columns will
            not be transformed.
            Use SkLearnTransformerFactoryFactory to conveniently create a factory.
        :param require_all_handled: whether to raise an exception if not all columns are matched by a rule
        :param inplace: whether to apply data frame transformations in-place
        :return: the transformer
        """
        return DFTNormalisation(self.get_normalisation_rules(), default_transformer_factory=default_transformer_factory,
            require_all_handled=require_all_handled, inplace=inplace)

    def create_dft_one_hot_encoder(self, ignore_unknown=False, inplace=False) -> DFTOneHotEncoder:
        """
        Creates a feature transformer that will apply one-hot encoding to all the features that are known to be categorical

        :param inplace: whether to perform the transformation in-place
        :param ignore_unknown: if True and an unknown category is encountered during transform, the resulting one-hot
            encoded columns for this feature will be all zeros. if False, an unknown category will raise an error.
        :return: the transformer
        """
        return DFTOneHotEncoder(self.get_categorical_feature_name_regex(), ignore_unknown=ignore_unknown, inplace=inplace)

    def create_feature_transformer_normalisation(self, default_transformer_factory=None, require_all_handled=True, inplace=False) \
            -> DFTNormalisation:
        """
        Creates a feature transformer that will apply normalisation to all supported (numeric) features.
        Alias of create_dft_normalisation.

        :param default_transformer_factory: a factory for the creation of transformer instances (which implements the
            API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
            rules that do not specify a particular transformer.
            The default transformer will only be applied to columns matched by such rules, unmatched columns will
            not be transformed.
            Use SkLearnTransformerFactoryFactory to conveniently create a factory.
        :param require_all_handled: whether to raise an exception if not all columns are matched by a rule
        :param inplace: whether to apply data frame transformations in-place
        :return: the transformer
        """
        return self.create_dft_normalisation(default_transformer_factory=default_transformer_factory,
            require_all_handled=require_all_handled, inplace=inplace)

    def create_feature_transformer_one_hot_encoder(self, ignore_unknown=False, inplace=False) -> DFTOneHotEncoder:
        """
        Creates a feature transformer that will apply one-hot encoding to all the features that are known to be categorical.
        Alias of create_dft_one_hot_encoder.

        :param inplace: whether to perform the transformation in-place
        :param ignore_unknown: if True and an unknown category is encountered during transform, the resulting one-hot
            encoded columns for this feature will be all zeros. if False, an unknown category will raise an error.
        :return: the transformer
        """
        return self.create_dft_one_hot_encoder(ignore_unknown=ignore_unknown, inplace=inplace)