Coverage for src/sensai/naive_bayes.py: 22%
55 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-29 18:29 +0000
1import collections
2from math import log, exp
3from typing import Optional
5import numpy as np
6import pandas as pd
8from .vector_model import VectorClassificationModel
class CategoricalNaiveBayesVectorClassificationModel(VectorClassificationModel):
    """
    Naive Bayes classifier for categorical features.

    Class priors and per-feature conditional distributions are estimated from
    empirical counts, smoothed by a pseudo-count to avoid zero probabilities
    for values unseen during training.
    """
    def __init__(self, pseudo_count=0.1):
        """
        :param pseudo_count: the count to add to each empirical count in order to avoid overfitting
        """
        super().__init__()
        # class label -> number of training rows with that label
        self.prior = None
        # class label -> list (one dict per feature column) mapping feature value -> count
        self.conditionals = None
        self.pseudoCount = pseudo_count

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Counts class and (class, feature, value) occurrences in the training data.

        :param x: data frame of categorical feature values
        :param y: single-column data frame of class labels
        :param weights: sample weights (unsupported; a warning is issued if given)
        """
        self._warn_sample_weights_unsupported(False, weights)
        prior = collections.defaultdict(lambda: 0)
        conditionals = collections.defaultdict(lambda: [collections.defaultdict(lambda: 0) for _ in range(x.shape[1])])
        for idx_row in range(x.shape[0]):
            cls = y.iloc[idx_row, 0]
            prior[cls] += 1
            for idx_feature in range(x.shape[1]):
                value = x.iloc[idx_row, idx_feature]
                conditionals[cls][idx_feature][value] += 1
        # store plain dicts rather than defaultdicts (whose lambda factories are not picklable)
        self.prior = dict(prior)
        self.conditionals = {k: [dict(d) for d in l] for k, l in conditionals.items()}

    def _log_joint_probability(self, features, cls) -> float:
        """
        Computes log p(cls) + sum over features of log p(value | cls), using the
        smoothed empirical counts collected during fitting.

        :param features: a sequence of feature values (one row of the input data)
        :param cls: the class label
        :return: the (unnormalized) joint log-probability of the class and the features
        """
        lp = log(self._probability(self.prior, cls))
        for idx_feature, value in enumerate(features):
            lp += log(self._probability(self.conditionals[cls][idx_feature], value))
        return lp

    def _predict_class_probabilities(self, x: pd.DataFrame):
        """
        :param x: data frame of categorical feature values
        :return: data frame with one column per class label, containing normalized class probabilities
        """
        results = []
        for _, features in x.iterrows():
            class_probabilities = np.zeros(len(self._labels))
            for i, cls in enumerate(self._labels):
                class_probabilities[i] = exp(self._log_joint_probability(features, cls))
            # normalize the joint probabilities to obtain posterior class probabilities
            class_probabilities /= np.sum(class_probabilities)
            results.append(class_probabilities)
        return pd.DataFrame(results, columns=self._labels)

    def _probability(self, counts, value) -> float:
        """
        Computes the smoothed relative frequency of `value` within `counts`.

        NOTE(review): the denominator adds the pseudo-count once rather than once per
        distinct value (standard Laplace smoothing), so probabilities over all values
        do not sum to 1; this appears to be a deliberate design choice — confirm.

        :param counts: dict mapping values to their empirical counts
        :param value: the value whose probability to estimate (may be unseen, i.e. absent from counts)
        :return: the smoothed probability estimate
        """
        value_count = counts.get(value, 0.0)
        total_count = sum(counts.values())
        return (value_count + self.pseudoCount) / (total_count + self.pseudoCount)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: data frame of categorical feature values
        :return: single-column data frame containing the most likely class label for each row
        """
        results = []
        for _, features in x.iterrows():
            best_cls = None
            best_lp = None
            for cls in self.prior:
                lp = self._log_joint_probability(features, cls)
                if best_lp is None or lp > best_lp:
                    best_lp = lp
                    best_cls = cls
            results.append(best_cls)
        return pd.DataFrame(results, columns=self.get_predicted_variable_names())