Coverage for src/sensai/columngen.py: 33%
78 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1from abc import ABC, abstractmethod
2import logging
3from typing import Any, Union, Optional
5import numpy as np
6import pandas as pd
8from .data_transformation import DFTNormalisation
9from .featuregen import FeatureGeneratorFromColumnGenerator
10from .util.cache import KeyValueCache
13log = logging.getLogger(__name__)
class ColumnGenerator:
    """
    Generates a single column (pd.Series) from an input data frame, which is to have the same index as the input.

    Subclasses implement :meth:`_generate_column`; callers use :meth:`generate_column`, which takes care of
    wrapping the raw result in a series and naming it.
    """
    def __init__(self, generated_column_name: str):
        """
        :param generated_column_name: the name of the column being generated
        """
        self.generatedColumnName = generated_column_name

    def generate_column(self, df: pd.DataFrame) -> pd.Series:
        """
        Generates a column from the input data frame

        :param df: the input data frame
        :return: the column as a named series, which has the same index as the input
        """
        result = self._generate_column(df)
        if isinstance(result, pd.Series):
            # Name a shallow copy rather than the returned object itself: the series handed back by the
            # subclass may be a column of df or an object the subclass keeps a reference to, and assigning
            # to its .name would silently rename it for everyone else holding it.
            result = result.copy(deep=False)
            result.name = self.generatedColumnName
        else:
            # lists/arrays carry no index; they are aligned positionally with the input
            result = pd.Series(result, index=df.index, name=self.generatedColumnName)
        return result

    @abstractmethod
    def _generate_column(self, df: pd.DataFrame) -> Union[pd.Series, list, np.ndarray]:
        """
        Performs the actual column generation

        :param df: the input data frame
        :return: a list/array of the same length as df or a series with the same index
        """
        pass

    def to_feature_generator(self,
            take_input_column_if_present: bool = False,
            normalisation_rule_template: Optional["DFTNormalisation.RuleTemplate"] = None,
            is_categorical: bool = False):
        """
        Transforms this column generator into a feature generator that can be used as part of a VectorModel.

        :param take_input_column_if_present: if True, then if a column whose name corresponds to the column to generate exists
            in the input data, simply copy it to generate the output (without using the column generator); if False, always
            apply the columnGen to generate the output
        :param normalisation_rule_template: template for a DFTNormalisation for the resulting column.
            This should only be provided if is_categorical is False
        :param is_categorical: whether the resulting column is categorical
        :return: a FeatureGeneratorFromColumnGenerator wrapping this column generator
        """
        return FeatureGeneratorFromColumnGenerator(self,
            take_input_column_if_present=take_input_column_if_present,
            normalisation_rule_template=normalisation_rule_template,
            is_categorical=is_categorical)
class IndexCachedColumnGenerator(ColumnGenerator):
    """
    Decorator for a column generator which adds support for cached column generation where cache keys are given by the input data frame's
    index. Entries not found in the cache are computed by the wrapped column generator.

    The main use case for this class is to add caching to existing ColumnGenerators. For creating a new caching
    ColumnGenerator the use of ColumnGeneratorCachedByIndex is encouraged.
    """

    log = log.getChild(__qualname__)

    def __init__(self, column_generator: ColumnGenerator, cache: KeyValueCache):
        """
        :param column_generator: the column generator with which to generate values for keys not found in the cache
        :param cache: the cache in which to store key-value pairs
        """
        super().__init__(column_generator.generatedColumnName)
        self.columnGenerator = column_generator
        self.cache = cache

    def _generate_column(self, df: pd.DataFrame) -> pd.Series:
        """
        Looks up every index label of df in the cache and delegates the rows that were not found to the
        wrapped column generator, writing the newly computed values back to the cache.

        :param df: the input data frame
        :return: a series containing the cached and the newly computed values
        """
        # compute series of cached values; labels for which the cache yields a null value (None/NaN)
        # are removed by dropna() and therefore count as missing
        cache_values = [self.cache.get(key) for key in df.index]
        cache_series = pd.Series(cache_values, dtype=object, index=df.index).dropna()

        # compute missing values (if any) via wrapped generator, storing them in the cache
        missing_values_df = df[~df.index.isin(cache_series.index)]
        self.log.info(f"Retrieved {len(cache_series)} values from the cache, {len(missing_values_df)} still to be computed by "
            f"{self.columnGenerator}")
        if len(missing_values_df) == 0:
            return cache_series

        missing_series = self.columnGenerator.generate_column(missing_values_df)
        # Series.iteritems() was removed in pandas 2.0; items() is available in all versions
        for key, value in missing_series.items():
            self.cache.set(key, value)

        combined = pd.concat((cache_series, missing_series))
        # The concatenation lists cached rows first and computed rows second; restore the row order of the
        # input so that the result has the same index as df. reindex() requires unique labels, so for a
        # non-unique index the concatenated order is kept.
        if combined.index.is_unique:
            combined = combined.reindex(df.index)
        return combined
class ColumnGeneratorCachedByIndex(ColumnGenerator, ABC):
    """
    Base class for column generators, which supports cached column generation, each value being generated independently.
    Cache keys are given by the input data frame's index.
    """

    log = log.getChild(__qualname__)

    def __init__(self, generated_column_name: str, cache: Optional[KeyValueCache], persist_cache=False):
        """
        :param generated_column_name: the name of the column being generated
        :param cache: the cache in which to store key-value pairs. If None, caching will be disabled
        :param persist_cache: whether to persist the cache when pickling
        """
        super().__init__(generated_column_name)
        self.cache = cache
        self.persistCache = persist_cache

    def _generate_column(self, df: pd.DataFrame) -> Union[pd.Series, list, np.ndarray]:
        """
        Generates the column row by row, consulting the cache (if any) before calling _generate_value.

        A cached value of None is indistinguishable from a cache miss and is therefore regenerated.

        :param df: the input data frame
        :return: the list of generated values, one per row of df and in the same order
        """
        self.log.info(f"Generating column {self.generatedColumnName} with {self.__class__.__name__}")
        values = []
        cache_hits = 0
        column_length = len(df)
        percentage_to_log = 0
        for i, namedTuple in enumerate(df.itertuples()):
            percentage_generated = int(100*i/column_length)
            # Use >= rather than ==: for short data frames the percentage can jump over a 5% threshold
            # (e.g. 3% -> 6% for 30 rows), and an equality test would then never fire again.
            if percentage_generated >= percentage_to_log:
                reached_percentage = percentage_generated - percentage_generated % 5
                self.log.debug(f"Processed {reached_percentage}% of {self.generatedColumnName}")
                percentage_to_log = reached_percentage + 5

            key = namedTuple.Index
            if self.cache is not None:
                value = self.cache.get(key)
                if value is None:
                    value = self._generate_value(namedTuple)
                    self.cache.set(key, value)
                else:
                    cache_hits += 1
            else:
                value = self._generate_value(namedTuple)
            values.append(value)
        if self.cache is not None:
            self.log.info(f"Cached column generation resulted in {cache_hits}/{column_length} cache hits")
        return values

    def __getstate__(self):
        """
        :return: the state to pickle; the cache is replaced by None unless persistCache is set
        """
        if not self.persistCache:
            # work on a copy so that the live instance keeps its cache
            d = self.__dict__.copy()
            d["cache"] = None
            return d
        return self.__dict__

    @abstractmethod
    def _generate_value(self, named_tuple) -> Any:
        """
        Generates the value for a single row

        :param named_tuple: the row as yielded by DataFrame.itertuples(); the row's index label (which is used
            as the cache key) is available as named_tuple.Index
        :return: the generated value
        """
        pass