import collections
import datetime
import logging
import typing
from abc import ABC, abstractmethod
from typing import Callable, List, Iterable, Optional
import numpy as np
import pandas as pd
from . import util, data_transformation
from .distance_metric import DistanceMetric
from .featuregen import FeatureGeneratorFromNamedTuples
from .util.string import object_repr
from .util.typing import PandasNamedTuple
from .vector_model import VectorClassificationModel, VectorRegressionModel
log = logging.getLogger(__name__)
[docs]class Neighbor:
def __init__(self, value: PandasNamedTuple, distance: float):
self.distance = distance
self.value = value
self.identifier = value.Index
[docs]class NeighborProvider(ABC):
def __init__(self, df_indexed_by_id: pd.DataFrame):
self.df = df_indexed_by_id
self.index = self.df.index
if any(self.index.duplicated()):
raise Exception("Dataframe index should not contain duplicates")
self.index_position_dict = {idx: pos for pos, idx in enumerate(self.index)}
[docs] @abstractmethod
def iter_potential_neighbors(self, value: PandasNamedTuple) -> Iterable[PandasNamedTuple]:
pass
@abstractmethod
def __str__(self):
return super().__str__()
[docs]class AllNeighborsProvider(NeighborProvider):
def __init__(self, df_indexed_by_id: pd.DataFrame):
super().__init__(df_indexed_by_id)
self.named_tuples = None
def __getstate__(self):
d = self.__dict__.copy()
d["namedTuples"] = None
return d
[docs] def iter_potential_neighbors(self, value):
identifier = value.Index
if self.named_tuples is None:
self.named_tuples = list(self.df.itertuples())
for nt in self.named_tuples:
if nt.Index != identifier:
yield nt
def __str__(self):
return str(self.__class__.__name__)
[docs]class TimerangeNeighborsProvider(NeighborProvider):
def __init__(self, df_indexed_by_id: pd.DataFrame, timestamps_column="timestamps",
past_time_range_days=120, future_time_range_days=120):
super().__init__(df_indexed_by_id)
if not pd.core.dtypes.common.is_datetime64_any_dtype(self.df[timestamps_column]):
raise Exception(f"Column {timestamps_column} does not have a compatible datatype")
self.timestamps_column = timestamps_column
self.past_time_range_days = past_time_range_days
self.future_time_range_days = future_time_range_days
self.past_timedelta = datetime.timedelta(days=past_time_range_days)
self.future_timedelta = datetime.timedelta(days=future_time_range_days)
[docs] def iter_potential_neighbors(self, value: PandasNamedTuple):
identifier = value.Index
input_time = getattr(value, self.timestamps_column)
max_time, min_time = input_time + self.future_timedelta, input_time - self.past_timedelta
neighbors_df = self.df[
self.df[self.timestamps_column].apply(lambda time: min_time < time < input_time)
]
if identifier in neighbors_df.index:
neighbors_df.drop(identifier, inplace=True)
return neighbors_df.itertuples()
def __str__(self):
return object_repr(self, ["past_time_range_days", "future_time_range_days"])
[docs]class AbstractKnnFinder(ABC):
[docs] @abstractmethod
def find_neighbors(self, named_tuple: PandasNamedTuple, n_neighbors=20) -> List[Neighbor]:
pass
@abstractmethod
def __str__(self):
super().__str__()
[docs]class CachingKNearestNeighboursFinder(AbstractKnnFinder):
"""
A nearest neighbor finder which uses a cache for distance metrics in order speed up repeated computations
of the neighbors of the same data point by keeping a pandas.Series of distances to all provided
data points cached. If the distance metric is of the composite type LinearCombinationDistanceMetric,
its component distance metrics are cached, such that weights in the linear combination can be varied
without necessitating recomputations.
"""
log = log.getChild(__qualname__)
def __init__(self, cache: 'CachingKNearestNeighboursFinder.DistanceMetricCache', distance_metric: DistanceMetric,
neighbor_provider: NeighborProvider):
self.neighbor_provider = neighbor_provider
# This field is purely for logging purposes
self.distance_metric = distance_metric
if isinstance(distance_metric, distance_metric.LinearCombinationDistanceMetric):
self.weighted_distance_metrics = [(cache.get_cached_metric(dm), w) for (w, dm) in distance_metric.metrics]
else:
self.weighted_distance_metrics = [(cache.get_cached_metric(distance_metric), 1)]
def __str__(self):
return object_repr(self, ["neighbor_provider", "distance_metric"])
[docs] class DistanceMetricCache:
"""
A cache for distance metrics which identifies equivalent distance metrics by their string representations.
The cache can be passed (consecutively) to multiple KNN models in order to speed up computations for the
same test data points. If the cache is reused, it is assumed that the neighbor provider remains the same.
"""
log = log.getChild(__qualname__)
def __init__(self):
self._cached_metrics_by_name = {}
[docs] def get_cached_metric(self, distance_metric):
key = str(distance_metric)
cached_metric = self._cached_metrics_by_name.get(key)
if cached_metric is None:
self.log.info(f"Creating new cached metric for key '{key}'")
cached_metric = CachingKNearestNeighboursFinder.CachedSeriesDistanceMetric(distance_metric)
self._cached_metrics_by_name[key] = cached_metric
else:
self.log.info(f"Reusing cached metric for key '{key}'")
return cached_metric
[docs] class CachedSeriesDistanceMetric:
"""
Provides caching for a wrapped distance metric: the series of all distances to provided potential neighbors
are retained in a cache
"""
def __init__(self, distance_metric):
self.distance_metric = distance_metric
self.cache = {}
[docs] def get_distance_series(self, named_tuple: PandasNamedTuple, potential_neighbor_values):
identifier = named_tuple.Index
series = self.cache.get(identifier)
if series is None:
distances = []
for neighborTuple in potential_neighbor_values:
distances.append(self.distance_metric.distance(named_tuple, neighborTuple))
series = pd.Series(distances)
self.cache[identifier] = series
return series
[docs] def find_neighbors(self, named_tuple: PandasNamedTuple, n_neighbors=20) -> List[Neighbor]:
potential_neighbors = list(self.neighbor_provider.iter_potential_neighbors(named_tuple))
summed_distance_series = None
for i, (metric, weight) in enumerate(self.weighted_distance_metrics):
weighted_distances_series = metric.get_distance_series(named_tuple, potential_neighbors) * weight
if i == 0:
summed_distance_series = weighted_distances_series.copy()
else:
summed_distance_series += weighted_distances_series
summed_distance_series.sort_values(ascending=True, inplace=True)
result = []
for i in range(n_neighbors):
neighbor_tuple = potential_neighbors[summed_distance_series.index[i]]
distance = summed_distance_series.iloc[i]
result.append(Neighbor(neighbor_tuple, distance))
return result
[docs]class KNearestNeighboursFinder(AbstractKnnFinder):
def __init__(self, distance_metric: DistanceMetric, neighbor_provider: NeighborProvider):
self.neighbor_provider = neighbor_provider
self.distance_metric = distance_metric
def __str__(self):
return object_repr(self, ["neighbor_provider", "distance_metric"])
[docs] def find_neighbors(self, named_tuple: PandasNamedTuple, n_neighbors=20) -> List[Neighbor]:
result = []
log.debug(f"Finding neighbors for {named_tuple.Index}")
for neighborTuple in self.neighbor_provider.iter_potential_neighbors(named_tuple):
distance = self.distance_metric.distance(named_tuple, neighborTuple)
result.append(Neighbor(neighborTuple, distance))
result.sort(key=lambda n: n.distance)
return result[:n_neighbors]
[docs]class KNearestNeighboursClassificationModel(VectorClassificationModel):
def __init__(self, num_neighbors: int, distance_metric: DistanceMetric,
neighbor_provider_factory: Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider,
distance_based_weighting=False, distance_epsilon=1e-3,
distance_metric_cache: CachingKNearestNeighboursFinder.DistanceMetricCache = None, **kwargs):
"""
:param num_neighbors: the number of nearest neighbors to consider
:param distance_metric: the distance metric to use
:param neighbor_provider_factory: a factory with which a neighbor provider can be constructed using data
:param distance_based_weighting: whether to weight neighbors according to their distance (inverse); if False, use democratic vote
:param distance_epsilon: a distance that is added to all distances for distance-based weighting (in order to avoid 0 distances);
:param distance_metric_cache: a cache for distance metrics which shall be used to store speed up repeated computations
of the neighbors of the same data point by keeping series of distances cached (particularly for composite distance metrics);
see class CachingKNearestNeighboursFinder
:param kwargs: parameters to pass on to super-classes
"""
super().__init__(**kwargs)
self.distance_epsilon = distance_epsilon
self.distance_based_weighting = distance_based_weighting
self.neighbor_provider_factory = neighbor_provider_factory
self.num_neighbors = num_neighbors
self.distance_metric = distance_metric
self.distance_metric_cache = distance_metric_cache
self.df = None
self.y = None
self.knn_finder = None
def _tostring_excludes(self) -> List[str]:
return super()._tostring_excludes() + ["neighbor_provider_factory", "distance_metric", "distance_metric_cache", "df", "y"]
# noinspection DuplicatedCode
def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
self._warn_sample_weights_unsupported(False, weights)
assert len(y.columns) == 1, "Expected exactly one column in label set Y"
self.df = x.merge(y, how="inner", left_index=True, right_index=True)
self.y = y
neighbor_provider = self.neighbor_provider_factory(self.df)
if self.distance_metric_cache is None:
self.knn_finder = KNearestNeighboursFinder(self.distance_metric, neighbor_provider)
else:
self.knn_finder = CachingKNearestNeighboursFinder(self.distance_metric_cache, self.distance_metric, neighbor_provider)
log.info(f"Using neighbor provider of type {self.knn_finder.__class__.__name__}")
def _predict_class_probabilities(self, x: pd.DataFrame):
output_df = pd.DataFrame({label: np.nan for label in self._labels}, index=x.index)
for nt in x.itertuples():
neighbors = self.find_neighbors(nt)
probabilities = self._predict_class_probability_vector_from_neighbors(neighbors)
output_df.loc[nt.Index] = probabilities
return output_df
def _predict_class_probability_vector_from_neighbors(self, neighbors: List['Neighbor']):
weights = collections.defaultdict(lambda: 0)
total = 0
for neigh in neighbors:
if self.distance_based_weighting:
weight = 1.0 / (neigh.distance + self.distance_epsilon)
else:
weight = 1
weights[self._get_label(neigh)] += weight
total += weight
return [weights[label] / total for label in self._labels]
def _get_label(self, neighbor: 'Neighbor'):
return self.y.iloc[:, 0].loc[neighbor.identifier]
[docs] def find_neighbors(self, named_tuple):
return self.knn_finder.find_neighbors(named_tuple, self.num_neighbors)
[docs]class KNearestNeighboursRegressionModel(VectorRegressionModel):
def __init__(self, num_neighbors: int, distance_metric: DistanceMetric,
neighbor_provider_factory: Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider,
distance_based_weighting=False, distance_epsilon=1e-3,
distance_metric_cache: CachingKNearestNeighboursFinder.DistanceMetricCache = None, **kwargs):
"""
:param num_neighbors: the number of nearest neighbors to consider
:param distance_metric: the distance metric to use
:param neighbor_provider_factory: a factory with which a neighbor provider can be constructed using data
:param distance_based_weighting: whether to weight neighbors according to their distance (inverse); if False, use democratic vote
:param distance_epsilon: a distance that is added to all distances for distance-based weighting (in order to avoid 0 distances);
:param distance_metric_cache: a cache for distance metrics which shall be used to store speed up repeated computations
of the neighbors of the same data point by keeping series of distances cached (particularly for composite distance metrics);
see class CachingKNearestNeighboursFinder
:param kwargs: parameters to pass on to super-classes
"""
super().__init__(**kwargs)
self.distance_epsilon = distance_epsilon
self.distance_based_weighting = distance_based_weighting
self.neighbor_provider_factory = neighbor_provider_factory
self.num_neighbors = num_neighbors
self.distance_metric = distance_metric
self.distance_metric_cache = distance_metric_cache
self.df = None
self.y = None
self.knn_finder = None
def _tostring_excludes(self) -> List[str]:
return super()._tostring_excludes() + ["neighbor_provider_factory", "distance_metric", "distance_metric_cache", "df", "y"]
# noinspection DuplicatedCode
def _fit(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
self._warn_sample_weights_unsupported(False, weights)
assert len(y.columns) == 1, "Expected exactly one column in label set Y"
self.df = x.merge(y, how="inner", left_index=True, right_index=True)
self.y = y
neighbor_provider = self.neighbor_provider_factory(self.df)
if self.distance_metric_cache is None:
self.knn_finder = KNearestNeighboursFinder(self.distance_metric, neighbor_provider)
else:
self.knn_finder = CachingKNearestNeighboursFinder(self.distance_metric_cache, self.distance_metric, neighbor_provider)
log.info(f"Using neighbor provider of type {self.knn_finder.__class__.__name__}")
def _get_target(self, neighbor: Neighbor):
return self.y.iloc[:, 0].loc[neighbor.identifier]
def _predict_single_input(self, named_tuple):
neighbors = self.knn_finder.find_neighbors(named_tuple, self.num_neighbors)
neighbor_targets = np.array([self._get_target(n) for n in neighbors])
if self.distance_based_weighting:
neighbor_weights = np.array([1.0 / (n.distance + self.distance_epsilon) for n in neighbors])
return np.sum(neighbor_targets * neighbor_weights) / np.sum(neighbor_weights)
else:
return np.mean(neighbor_targets)
def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
predicted_values = []
for i, nt in enumerate(x.itertuples()):
predicted_values.append(self._predict_single_input(nt))
return pd.DataFrame({self._predictedVariableNames[0]: predicted_values}, index=x.index)
[docs]class FeatureGeneratorNeighbors(FeatureGeneratorFromNamedTuples):
"""
Generates features based on nearest neighbors. For each neighbor, a set of features is added to the output data frame.
Each feature has the name "n{0-based neighbor index}_{feature name}", where the feature names are configurable
at construction. The feature name "distance", which indicates the distance of the neighbor to the data point is
always present.
"""
def __init__(self, num_neighbors: int,
neighbor_attributes: typing.List[str],
distance_metric: DistanceMetric,
neighbor_provider_factory: typing.Callable[[pd.DataFrame], NeighborProvider] = AllNeighborsProvider,
cache: util.cache.KeyValueCache = None,
categorical_feature_names: typing.Sequence[str] = (),
normalisation_rules: typing.Sequence[data_transformation.DFTNormalisation.Rule] = ()):
"""
:param num_neighbors: the number of neighbors for to generate features
:param neighbor_attributes: the attributes of the neighbor's named tuple to include as features (in addition to "distance")
:param distance_metric: the distance metric defining which neighbors are near
:param neighbor_provider_factory: a factory for the creation of neighbor provider
:param cache: an optional key-value cache in which feature values are stored by data point identifier (as given by the DataFrame's
index)
"""
super().__init__(cache=cache, categorical_feature_names=categorical_feature_names, normalisation_rules=normalisation_rules)
self.neighbor_attributes = neighbor_attributes
self.distance_metric = distance_metric
self.neighbor_provider_factory = neighbor_provider_factory
self.num_neighbors = num_neighbors
self._knn_finder: Optional[KNearestNeighboursFinder] = None
self._train_x = None
def _generate(self, df: pd.DataFrame, ctx=None):
if self._train_x is None:
raise Exception("Feature generator has not been fitted")
neighbor_provider = self.neighbor_provider_factory(self._train_x)
self._knn_finder = KNearestNeighboursFinder(self.distance_metric, neighbor_provider)
return super()._generate(df, ctx)
def _generate_feature_dict(self, named_tuple) -> typing.Dict[str, typing.Any]:
neighbors = self._knn_finder.find_neighbors(named_tuple, self.num_neighbors)
result = {}
for i, neighbor in enumerate(neighbors):
result[f"n{i}_distance"] = neighbor.distance
for attr in self.neighbor_attributes:
result[f"n{i}_{attr}"] = getattr(neighbor.value, attr)
return result
def _fit(self, x: pd.DataFrame, y: pd.DataFrame = None, ctx=None):
self._train_x = x