Source code for sensai.naive_bayes
import collections
from math import log, exp
from typing import Optional
import numpy as np
import pandas as pd
from .vector_model import VectorClassificationModel
[docs]class CategoricalNaiveBayesVectorClassificationModel(VectorClassificationModel):
"""
Naive Bayes with categorical features
"""
def __init__(self, pseudo_count=0.1):
"""
:param pseudo_count: the count to add to each empirical count in order to avoid overfitting
"""
super().__init__()
self.prior = None
self.conditionals = None
self.pseudoCount = pseudo_count
def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
self._warn_sample_weights_unsupported(False, weights)
self.prior = collections.defaultdict(lambda: 0)
self.conditionals = collections.defaultdict(lambda: [collections.defaultdict(lambda: 0) for _ in range(x.shape[1])])
increment = 1
for idxRow in range(x.shape[0]):
cls = y.iloc[idxRow, 0]
self.prior[cls] += increment
for idxFeature in range(x.shape[1]):
value = x.iloc[idxRow, idxFeature]
self.conditionals[cls][idxFeature][value] += increment
# get rid of defaultdicts, which are not picklable
self.prior = dict(self.prior)
self.conditionals = {k: [dict(d) for d in l] for k, l in self.conditionals.items()}
def _predict_class_probabilities(self, x: pd.DataFrame):
results = []
for _, features in x.iterrows():
class_probabilities = np.zeros(len(self._labels))
for i, cls in enumerate(self._labels):
lp = log(self._probability(self.prior, cls))
for idx_feature, value in enumerate(features):
lp += log(self._probability(self.conditionals[cls][idx_feature], value))
class_probabilities[i] = exp(lp)
class_probabilities /= np.sum(class_probabilities)
results.append(class_probabilities)
return pd.DataFrame(results, columns=self._labels)
def _probability(self, counts, value):
value_count = counts.get(value, 0.0)
total_count = sum(counts.values())
return (value_count + self.pseudoCount) / (total_count + self.pseudoCount)
def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
results = []
for _, features in x.iterrows():
best_cls = None
best_lp = None
for cls in self.prior:
lp = log(self._probability(self.prior, cls))
for idxFeature, value in enumerate(features):
lp += log(self._probability(self.conditionals[cls][idxFeature], value))
if best_lp is None or lp > best_lp:
best_lp = lp
best_cls = cls
results.append(best_cls)
return pd.DataFrame(results, columns=self.get_predicted_variable_names())