Coverage for src/sensai/lightgbm.py: 0%

47 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import logging 

2import re 

3from typing import Sequence, Union, Optional 

4 

5import lightgbm 

6import pandas as pd 

7 

8from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \ 

9 FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification 

10from .util.string import or_regex_group 

11 

12log = logging.getLogger(__name__) 

13 

14 

15# noinspection PyUnusedLocal 

16def _update_fit_args(fit_args: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categorical_feature_name_regex: Optional[str]): 

17 if categorical_feature_name_regex is not None: 

18 cols = list(inputs.columns) 

19 categorical_feature_names = [col for col in cols if re.match(categorical_feature_name_regex, col)] 

20 col_indices = [cols.index(f) for f in categorical_feature_names] 

21 args = {"categorical_feature": col_indices} 

22 log.info(f"Updating fit parameters with {args}") 

23 fit_args.update(args) 

24 else: 

25 fit_args.pop("categorical_feature", None) 

26 

27 

class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel,
        FeatureImportanceProviderSkLearnRegressionMultipleOneDim):
    """Regression model wrapping lightgbm's LGBMRegressor, fitting one model per output dimension."""
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features is preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        """
        super().__init__(lightgbm.sklearn.LGBMRegressor, random_state=random_state, num_leaves=num_leaves, importance_type=importance_type,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples,
            **model_args)

        # reduce the categorical feature specification to a single regex (or None):
        # a string is taken as a regex directly, a non-empty sequence is combined into one alternation group
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # inject/remove the categorical_feature fit argument based on the actual input columns
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

65 

66 

class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification):
    """Classification model wrapping lightgbm's LGBMClassifier."""
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", use_balanced_class_weights=False,
            **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features may be preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param use_balanced_class_weights: whether to compute class weights from the training data that is given and pass it on to the
            classifier's fit method; weighted data points may not be supported for all types of models
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier
        """
        super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples, importance_type=importance_type,
            use_balanced_class_weights=use_balanced_class_weights, **model_args)

        # reduce the categorical feature specification to a single regex (or None):
        # a string is taken as a regex directly, a non-empty sequence is combined into one alternation group
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # inject/remove the categorical_feature fit argument based on the actual input columns
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def _predict_class_probabilities(self, x: pd.DataFrame):
        if len(self._labels) == 1:
            # special handling required because LGBMClassifier will return values for two classes even if there is only one;
            # keep only the first column, which corresponds to the single known label
            y = self.model.predict_proba(self._transform_input(x))
            y = y[:, 0]
            return pd.DataFrame(y, columns=self._labels)
        else:
            return super()._predict_class_probabilities(x)