Coverage for src/sensai/naive_bayes.py: 22%

55 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-11-29 18:29 +0000

1import collections 

2from math import log, exp 

3from typing import Optional 

4 

5import numpy as np 

6import pandas as pd 

7 

8from .vector_model import VectorClassificationModel 

9 

10 

class CategoricalNaiveBayesVectorClassificationModel(VectorClassificationModel):
    """
    Naive Bayes classifier for categorical features.

    Per-class priors and per-feature conditional value counts are estimated from
    empirical counts during fitting; additive (pseudo-count) smoothing is applied
    at prediction time to avoid zero probabilities for values unseen in training.
    """
    def __init__(self, pseudo_count=0.1):
        """
        :param pseudo_count: the count to add to each empirical count in order to avoid overfitting
        """
        super().__init__()
        # class label -> empirical count; populated by _fit_classifier
        self.prior = None
        # class label -> list (one entry per feature) of {value: count}; populated by _fit_classifier
        self.conditionals = None
        self.pseudoCount = pseudo_count  # attribute name kept for backward compatibility

    def _fit_classifier(self, x: pd.DataFrame, y: pd.DataFrame, weights: Optional[pd.Series] = None):
        """
        Accumulates per-class and per-(class, feature, value) counts from the training data.

        :param x: data frame of categorical feature values
        :param y: data frame whose first column holds the class labels
        :param weights: sample weights (unsupported; a warning is issued if provided)
        """
        self._warn_sample_weights_unsupported(False, weights)
        self.prior = collections.defaultdict(lambda: 0)
        self.conditionals = collections.defaultdict(
            lambda: [collections.defaultdict(lambda: 0) for _ in range(x.shape[1])])
        increment = 1
        for idx_row in range(x.shape[0]):
            cls = y.iloc[idx_row, 0]
            self.prior[cls] += increment
            for idx_feature in range(x.shape[1]):
                value = x.iloc[idx_row, idx_feature]
                self.conditionals[cls][idx_feature][value] += increment
        # get rid of defaultdicts, which are not picklable
        self.prior = dict(self.prior)
        self.conditionals = {k: [dict(d) for d in l] for k, l in self.conditionals.items()}

    def _predict_class_probabilities(self, x: pd.DataFrame):
        """
        :param x: data frame of categorical feature values
        :return: data frame with one column per class label containing normalised class probabilities
        """
        results = []
        for _, features in x.iterrows():
            log_probabilities = np.zeros(len(self._labels))
            for i, cls in enumerate(self._labels):
                lp = log(self._probability(self.prior, cls))
                for idx_feature, value in enumerate(features):
                    lp += log(self._probability(self.conditionals[cls][idx_feature], value))
                log_probabilities[i] = lp
            # Normalise in log space (log-sum-exp shift): exponentiating the raw
            # log-probabilities directly can underflow to 0 for all classes when
            # there are many features, which would make the normalisation below
            # divide by zero and produce NaNs. Subtracting the maximum first is
            # mathematically equivalent and numerically stable.
            log_probabilities -= np.max(log_probabilities)
            class_probabilities = np.exp(log_probabilities)
            class_probabilities /= np.sum(class_probabilities)
            results.append(class_probabilities)
        return pd.DataFrame(results, columns=self._labels)

    def _probability(self, counts, value):
        """
        Computes the smoothed relative frequency of `value` within `counts`.

        NOTE(review): the denominator adds a single pseudo-count rather than
        pseudo_count * (number of distinct values) as in textbook Laplace
        smoothing; kept as-is for backward compatibility with trained models.

        :param counts: dict mapping values to empirical counts
        :param value: the value whose probability to compute (0 count if unseen)
        :return: the smoothed probability, always > 0 for pseudoCount > 0
        """
        value_count = counts.get(value, 0.0)
        total_count = sum(counts.values())
        return (value_count + self.pseudoCount) / (total_count + self.pseudoCount)

    def _predict(self, x: pd.DataFrame) -> pd.DataFrame:
        """
        :param x: data frame of categorical feature values
        :return: data frame with the most likely class label (MAP estimate) per row
        """
        results = []
        for _, features in x.iterrows():
            best_cls = None
            best_lp = None
            for cls in self.prior:
                # unnormalised log-posterior; normalisation is not needed for argmax
                lp = log(self._probability(self.prior, cls))
                for idx_feature, value in enumerate(features):
                    lp += log(self._probability(self.conditionals[cls][idx_feature], value))
                if best_lp is None or lp > best_lp:
                    best_lp = lp
                    best_cls = cls
            results.append(best_cls)
        return pd.DataFrame(results, columns=self.get_predicted_variable_names())