Source code for sensai.vectoriser

from abc import ABC, abstractmethod
from enum import Enum
import logging
from typing import Callable, Union, TypeVar, Generic, Sequence, List, Tuple, Iterable, Dict, Hashable, Optional

import numpy as np

from .util import LogTime
from .util.pickle import setstate
from .util.string import list_string, ToStringMixin

T = TypeVar("T")

log = logging.getLogger(__name__)


class Vectoriser(Generic[T], ToStringMixin):
    """
    A vectoriser represents a method for the conversion of instances of some type T into vectors,
    i.e. one-dimensional (numeric) arrays, or (in the special case of a 1D vector) scalars
    """
    log = log.getChild(__qualname__)

    def __init__(self, f: Callable[[T], Union[float, np.ndarray, list]], transformer=None, is_fitted=False):
        """
        :param f: the function which maps from an instance of T to an array/list/scalar
        :param transformer: an optional transformer (e.g. instance of one of the classes in sklearn.preprocessing)
            which can be used to transform/normalise the generated arrays
        :param is_fitted: whether the vectoriser (and therefore the given transformer) is assumed to be fitted already
        """
        self._fn = f
        self.transformer = transformer
        self._resultType = None
        self.name = None
        self._is_fitted = is_fitted

    def __setstate__(self, state):
        new_default_properties = {
            "_is_fitted": True  # we assume that any persisted objects have been fitted
        }
        setstate(Vectoriser, self, state, new_optional_properties=["_resultType", "name"],
            renamed_properties={"f": "_fn"}, new_default_properties=new_default_properties)

    def _tostring_exclude_private(self) -> bool:
        return True

    def is_fitted(self) -> bool:
        return self._is_fitted

    def set_name(self, name):
        self.name = name

    def get_name(self):
        """
        :return: the name of this vectoriser, which may be a default name if the name has not been set.
            Note that vectorisers created via a VectoriserRegistry always get the name under which the
            corresponding factory was registered.
        """
        if self.name is None:
            return f"{self.__class__.__name__}-{id(self)}"
        return self.name

    def fit(self, items: Iterable[T]):
        if self.transformer is not None:
            values = [self._f(item) for item in items]
            self.transformer.fit(np.array(values))
        self._is_fitted = True

    def _f(self, x) -> np.array:
        y = self._fn(x)

        if self._resultType is None:
            self._resultType = self.ResultType.from_value(y)

        if self._resultType == self.ResultType.LIST:
            y = np.array(y)
        elif self._resultType == self.ResultType.SCALAR:
            y = np.array([y])

        return y

    def apply(self, item: T, transform=True) -> np.array:
        """
        :param item: the item to be vectorised
        :param transform: whether to apply this instance's transformer (if any)
        :return: a vector
        """
        value = self._f(item)
        if self.transformer is not None and transform:
            value = self.transformer.transform([value])[0]
        return value

    def apply_multi(self, items: Iterable[T], transform=True, use_cache=False, verbose=False) -> List[np.array]:
        """
        Applies this vectoriser to multiple items at once.
        Especially for cases where this vectoriser uses a transformer, this method is significantly faster than
        calling apply repeatedly.

        :param items: the items to be vectorised
        :param transform: whether to apply this instance's transformer (if any)
        :param use_cache: whether to apply caching of the value function f given at construction (keeping track of
            outputs for each input object id), which can significantly speed up computation in cases where an item
            appears more than once in the collection of items
        :param verbose: whether to generate log messages
        :return: a list of vectors
        """
        if verbose:
            self.log.info(f"Applying {self}")

        with LogTime("Application", enabled=verbose, logger=self.log):
            if not use_cache:
                compute_value = self._f
            else:
                cache = {}

                def compute_value(x):
                    key = id(x)
                    value = cache.get(key)
                    if value is None:
                        value = self._f(x)
                        cache[key] = value
                    return value

            values = [compute_value(x) for x in items]
            if self.transformer is not None and transform:
                values = self.transformer.transform(values)

        return values

    class ResultType(Enum):
        SCALAR = 0
        LIST = 1
        NUMPY_ARRAY = 2

        @classmethod
        def from_value(cls, y):
            if type(y) == list:
                return cls.LIST
            elif np.isscalar(y):
                return cls.SCALAR
            elif isinstance(y, np.ndarray):
                return cls.NUMPY_ARRAY
            else:
                raise ValueError(f"Received unhandled value of type {type(y)}")
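

# Illustrative usage sketch: a minimal example of how a Vectoriser might be used, assuming that
# scikit-learn is available and that a StandardScaler is a suitable transformer (the class docstring
# above mentions sklearn.preprocessing); the items (coordinate tuples) and the value function are
# hypothetical and chosen purely for illustration.
def _example_vectoriser_usage():
    from sklearn.preprocessing import StandardScaler

    points = [(1.0, 2.0), (3.0, 4.0), (1.0, 0.0)]
    # maps each point to a list of its coordinates; lists are converted to 1D arrays internally
    vectoriser = Vectoriser(lambda p: [p[0], p[1]], transformer=StandardScaler())
    vectoriser.fit(points)  # fits the transformer on the vectorised items
    return vectoriser.apply_multi(points)  # the normalised vectors, one per point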


class EmptyVectoriser(Vectoriser):
    def __init__(self):
        super().__init__(self._create_empty_vector)

    # noinspection PyUnusedLocal
    @staticmethod
    def _create_empty_vector(x):
        return np.zeros(0)


class ItemIdentifierProvider(Generic[T], ABC):
    """
    Provides identifiers for sequence items.
    """
    @abstractmethod
    def get_identifier(self, item: T) -> Hashable:
        pass
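

# Illustrative sketch: a minimal ItemIdentifierProvider implementation which could be passed to a
# SequenceVectoriser (fitting mode UNIQUE, see below) in order to determine item identity by value
# rather than by object identity; the assumption that items are coordinate tuples is hypothetical.
class _ExampleTupleIdentifierProvider(ItemIdentifierProvider[Tuple[float, ...]]):
    def get_identifier(self, item: Tuple[float, ...]) -> Hashable:
        # the tuple itself is hashable and thus serves as the identifier
        return item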


class SequenceVectoriser(Generic[T], ToStringMixin):
    """
    Supports the application of Vectorisers to sequences of objects of some type T, where each object of type T is
    mapped to a vector (1D array) by the vectorisers.
    A SequenceVectoriser is fitted by fitting the underlying Vectorisers. In order to obtain the instances of T that
    are used for training, we take into consideration the fact that the sequences of T may overlap and thus training
    is performed on the set of unique instances.
    """
    log = log.getChild(__qualname__)

    class FittingMode(Enum):
        """
        Determines how the individual vectorisers are fitted based on several sequences of objects of type T that
        are given. If NONE, no fitting is performed; otherwise the mode determines how a single sequence of objects
        of type T for fitting is obtained from the collection of sequences: either by forming the set of unique
        objects from the sequences (UNIQUE) or by concatenating all sequences into a single sequence (CONCAT).
        """
        NONE = "none"  # no fitting is performed
        UNIQUE = "unique"  # use collection of unique items
        CONCAT = "concat"  # use collection obtained by concatenating all sequences using numpy.concatenate

    def __init__(self, vectorisers: Union[Sequence[Vectoriser[T]], Vectoriser[T]],
            fitting_mode: FittingMode = FittingMode.UNIQUE,
            unique_id_provider: Optional[ItemIdentifierProvider] = None,
            refit_vectorisers: bool = True):
        """
        :param vectorisers: zero or more vectorisers that are to be applied. If more than one vectoriser is supplied,
            vectors are generated from input instances of type T by concatenating the results of the vectorisers in
            the order the vectorisers are given.
        :param fitting_mode: the fitting mode for vectorisers.
            If `NONE`, no fitting takes place.
            If `UNIQUE`, fit vectorisers on the unique set of items of type T. By default, uniqueness is determined
            based on Python object identity. If a custom mechanism for determining an item's identity is desired,
            pass `unique_id_provider`.
            If `CONCAT`, fit vectorisers based on all items of type T, concatenating them to a single sequence.
        :param unique_id_provider: an object used to determine item identities when using fitting mode `UNIQUE`
        :param refit_vectorisers: whether any vectorisers that have previously been fitted shall be fitted once more
            when this sequence vectoriser is fitted. Set this to False if you are reusing vectorisers that are also
            part of another sequence vectoriser that will be fitted/has been fitted before this sequence vectoriser.
            This can be useful, in particular, in encoder-decoders where the target features are partly the same as
            the history sequence features, and we want to reuse the latter and their fitted transformers for the
            target features.
        """
        self.fittingMode = fitting_mode
        self.uniqueIdProvider = unique_id_provider
        self.refitVectorisers = refit_vectorisers
        if isinstance(vectorisers, Vectoriser):
            self.vectorisers = [vectorisers]
        else:
            self.vectorisers = vectorisers
        if len(self.vectorisers) == 0:
            self.vectorisers = [EmptyVectoriser()]

    def __setstate__(self, state):
        state["fittingMode"] = state.get("fittingMode", self.FittingMode.UNIQUE)
        setstate(SequenceVectoriser, self, state, new_optional_properties=["uniqueIdProvider"],
            new_default_properties={"refitVectorisers": True})

    def fit(self, data: Iterable[Sequence[T]]):
        log.debug(f"Fitting {self}")

        if self.fittingMode == self.FittingMode.NONE:
            return

        if not self.refitVectorisers:
            if all(v.is_fitted() for v in self.vectorisers):
                log.debug("No vectorisers to be fitted; all contained vectorisers have previously been fitted")
                return

        # obtain items for fitting
        if self.fittingMode == self.FittingMode.UNIQUE:
            if self.uniqueIdProvider is None:
                items = set()
                for seq in data:
                    items.update(seq)
            else:
                items = []
                identifiers = set()
                for seq in data:
                    for item in seq:
                        identifier = self.uniqueIdProvider.get_identifier(item)
                        if identifier not in identifiers:
                            identifiers.add(identifier)
                            items.append(item)
        elif self.fittingMode == self.FittingMode.CONCAT:
            items = np.concatenate(data)  # type: ignore
        else:
            raise ValueError(self.fittingMode)

        for v in self.vectorisers:
            if self.refitVectorisers or not v.is_fitted():
                log.debug(f"Fitting {v}")
                v.fit(items)

    def apply(self, seq: Sequence[T], transform=True) -> List[np.array]:
        """
        Applies vectorisation to the given sequence of objects

        :param seq: the sequence to vectorise
        :param transform: whether to apply any post-vectorisation transformers
        :return: a list of vectors, one for each item in the given sequence
        """
        vectors_list = []
        for item in seq:
            vectors = [vec.apply(item, transform=transform) for vec in self.vectorisers]
            conc = np.concatenate(vectors, axis=0)
            vectors_list.append(conc)
        return vectors_list

    def apply_multi(self, sequences: Iterable[Sequence[T]], use_cache=False, verbose=False) \
            -> Tuple[List[List[np.array]], List[int]]:
        """
        Applies this vectoriser to multiple sequences of objects of type T, where each sequence is mapped to a
        sequence of 1D arrays.
        This method can be significantly faster than multiple applications of apply, especially in cases where the
        vectorisers use transformers.

        :param sequences: the sequences to vectorise
        :param use_cache: whether to apply caching of the value functions of contained vectorisers (keeping track of
            outputs for each input object id), which can significantly speed up computation in cases where the given
            sequences contain individual items more than once
        :param verbose: whether to generate log messages
        :return: a pair (vl, l) where vl is a list of lists of vectors/arrays and l is a list of integers containing
            the lengths of the sequences
        """
        if verbose:
            self.log.info(f"Applying {self} (useCache={use_cache})")
        lengths = [len(s) for s in sequences]

        if verbose:
            self.log.info("Generating combined sequence")
        combined_seq = []
        for seq in sequences:
            combined_seq.extend(seq)

        individual_vectoriser_results = [vectoriser.apply_multi(combined_seq, use_cache=use_cache, verbose=verbose)
            for vectoriser in self.vectorisers]
        conc_vectors = [np.concatenate(x, axis=0) for x in zip(*individual_vectoriser_results)]

        vector_sequences = []
        idx_start = 0
        for l in lengths:
            vector_sequences.append(conc_vectors[idx_start:idx_start + l])
            idx_start += l

        return vector_sequences, lengths

    def apply_multi_with_padding(self, sequences: Sequence[Sequence[T]], use_cache=False, verbose=False) \
            -> Tuple[List[List[np.array]], List[int]]:
        """
        Applies this vectoriser to multiple sequences of objects of type T, where each sequence is mapped to a
        sequence of 1D arrays.
        Sequences are allowed to vary in length. For shorter sequences, 0-vectors are appended until the maximum
        sequence length is reached (padding).

        :param sequences: the sequences to vectorise
        :param use_cache: whether to apply caching of the value functions of contained vectorisers (keeping track of
            outputs for each input object id), which can significantly speed up computation in cases where the given
            sequences contain individual items more than once
        :param verbose: whether to generate log messages
        :return: a pair (vl, l) where vl is a list of lists of vectors/arrays, each list having the same length,
            and l is a list of integers containing the original unpadded lengths of the sequences
        """
        result, lengths = self.apply_multi(sequences, use_cache=use_cache, verbose=verbose)
        if verbose:
            self.log.info("Applying padding")
        max_length = max(lengths)
        dim = len(result[0][0])
        dummy_vec = np.zeros((dim,))
        for seq in result:
            for i in range(max_length - len(seq)):
                seq.append(dummy_vec)
        return result, lengths

    def get_vector_dim(self, seq: Sequence[T]):
        """
        Determines the dimensionality of generated vectors by applying the vectoriser to the given sequence

        :param seq: the sequence
        :return: the number of dimensions in generated output vectors (per item)
        """
        return len(self.apply(seq, transform=False)[0])
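

# Illustrative usage sketch: a minimal example of how a SequenceVectoriser might combine several
# vectorisers and pad variable-length sequences; the items (coordinate tuples) and the value
# functions are hypothetical and chosen purely for illustration.
def _example_sequence_vectoriser_usage():
    seq_a = [(1.0, 2.0), (3.0, 4.0)]
    seq_b = [(0.0, 1.0)]
    sequence_vectoriser = SequenceVectoriser([
        Vectoriser(lambda p: p[0]),          # scalar result -> vector of dimension 1
        Vectoriser(lambda p: [p[0], p[1]]),  # list result -> vector of dimension 2
    ])
    sequence_vectoriser.fit([seq_a, seq_b])  # fits the underlying vectorisers on the unique items
    # each item is mapped to the concatenation of the vectorisers' outputs (here: dimension 3);
    # the shorter sequence is padded with 0-vectors up to the maximum length
    padded, lengths = sequence_vectoriser.apply_multi_with_padding([seq_a, seq_b])
    return padded, lengths  # lengths == [2, 1]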


class VectoriserRegistry:
    def __init__(self):
        self._factories: Dict[Hashable, Callable[[Callable], Vectoriser]] = {}

    def get_available_vectorisers(self):
        return list(self._factories.keys())

    @staticmethod
    def _name(name: Hashable):
        # for enums, which have .name, use the name only, because it is less problematic to persist
        if hasattr(name, "name"):
            name = name.name
        return name

    def register_factory(self, name: Hashable, factory: Callable[[Callable], Vectoriser],
            additional_names: Optional[Iterable[Hashable]] = None):
        """
        Registers a vectoriser factory which can subsequently be referenced via its name

        :param name: the name (which can, in particular, be a string or an enum item)
        :param factory: the factory, which takes the default transformer factory as an argument
        :param additional_names: (optional) additional names under which to register the factory
        """
        self._register_factory(name, factory)
        if additional_names is not None:
            for n in additional_names:
                self._register_factory(n, factory)

    def _register_factory(self, name: Hashable, factory):
        name = self._name(name)
        if name in self._factories:
            raise ValueError(f"Vectoriser factory for name '{name}' already registered")
        self._factories[name] = factory

    def get_vectoriser(self, name: Hashable, default_transformer_factory: Callable) -> Vectoriser:
        """
        Creates a vectoriser from a name, which must have been previously registered.

        :param name: the name (which can, in particular, be a string or an enum item)
        :param default_transformer_factory: the default transformer factory
        :return: a new vectoriser instance
        """
        name = self._name(name)
        factory = self._factories.get(name)
        if factory is None:
            raise ValueError(f"No vectoriser factory registered for name '{name}': known names: "
                f"{list_string(self._factories.keys())}. Register the factory first.")
        instance = factory(default_transformer_factory)
        instance.set_name(name)
        return instance

    def get_vectorisers(self, names: List[Hashable], default_transformer_factory: Callable) -> List[Vectoriser]:
        return [self.get_vectoriser(name, default_transformer_factory) for name in names]
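

# Illustrative usage sketch: a minimal example of how vectoriser factories might be registered and
# retrieved by name, assuming scikit-learn's StandardScaler as the default transformer factory;
# the name "xy" and the value function are hypothetical.
def _example_vectoriser_registry_usage():
    from sklearn.preprocessing import StandardScaler

    registry = VectoriserRegistry()
    # the factory receives the default transformer factory and creates a fresh Vectoriser
    registry.register_factory("xy",
        lambda transformer_factory: Vectoriser(lambda p: [p[0], p[1]], transformer=transformer_factory()))
    vectoriser = registry.get_vectoriser("xy", StandardScaler)
    return vectoriser.get_name()  # returns "xy", the name under which the factory was registered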