Coverage for src/sensai/lightgbm.py: 0% (51 statements)

import logging
import re
from typing import Sequence, Union, Optional

import lightgbm
import pandas as pd

from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \
    FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification
from .util.string import or_regex_group

log = logging.getLogger(__name__)


# noinspection PyUnusedLocal
def _update_fit_args(fit_args: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categorical_feature_name_regex: Optional[str]):
    """
    Updates fit_args in place: if a regex is given, sets the "categorical_feature" entry to the list of
    positional indices of the input columns whose names match the regex; otherwise removes the entry.
    """
    if categorical_feature_name_regex is not None:
        cols = list(inputs.columns)
        categorical_feature_names = [col for col in cols if re.match(categorical_feature_name_regex, col)]
        col_indices = [cols.index(f) for f in categorical_feature_names]
        args = {"categorical_feature": col_indices}
        log.info(f"Updating fit parameters with {args}")
        fit_args.update(args)
    else:
        fit_args.pop("categorical_feature", None)
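

# Illustrative sketch (not part of the original module): how _update_fit_args translates a
# column-name regex into the positional indices that lightgbm expects; the column names used
# here are hypothetical.
def _example_update_fit_args():
    fit_args = {}
    inputs = pd.DataFrame({"cat_color": ["r", "g"], "size": [1.0, 2.0]})
    # Only "cat_color" matches the regex, so its positional index (0) is registered
    _update_fit_args(fit_args, inputs, pd.DataFrame(), r"cat_.*")
    assert fit_args == {"categorical_feature": [0]}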

class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel,
        FeatureImportanceProviderSkLearnRegressionMultipleOneDim):
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string
            containing a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (they will be inferred automatically).
            In general, passing categorical features is preferable to alternatives such as one-hot encoding.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: the number of boosted trees to fit
        :param min_child_samples: the minimum number of data points required in a child (leaf) node
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', the result contains the number of times the feature is used in a model.
            If 'gain', the result contains the total gains of the splits which use the feature.
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        """
        super().__init__(lightgbm.sklearn.LGBMRegressor, random_state=random_state, num_leaves=num_leaves,
            importance_type=importance_type, max_depth=max_depth, n_estimators=n_estimators,
            min_child_samples=min_child_samples, **model_args)

        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        elif categorical_feature_names is not None and len(categorical_feature_names) > 0:
            categorical_feature_name_regex = or_regex_group(categorical_feature_names)
        else:
            categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def is_sample_weight_supported(self) -> bool:
        return True
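

# Illustrative sketch (not part of the original module): two equivalent ways of declaring
# categorical features when constructing the model; the "cat_" prefix is a hypothetical
# naming convention.
def _example_regression_model():
    # Via a regex matching all categorical column names ...
    model_from_regex = LightGBMVectorRegressionModel(categorical_feature_names=r"cat_.*")
    # ... or via an explicit sequence of column names (combined into a single regex internally)
    model_from_names = LightGBMVectorRegressionModel(categorical_feature_names=["cat_color", "cat_size"])
    return model_from_regex, model_from_names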

class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification):
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", use_balanced_class_weights=False,
            **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string
            containing a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (they will be inferred automatically).
            In general, passing categorical features may be preferable to alternatives such as one-hot encoding.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: the number of boosted trees to fit
        :param min_child_samples: the minimum number of data points required in a child (leaf) node
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', the result contains the number of times the feature is used in a model.
            If 'gain', the result contains the total gains of the splits which use the feature.
        :param use_balanced_class_weights: whether to compute class weights from the given training data and pass them on to the
            classifier's fit method; weighted data points may not be supported by all types of models
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier
        """
        super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples,
            importance_type=importance_type, use_balanced_class_weights=use_balanced_class_weights, **model_args)

        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        elif categorical_feature_names is not None and len(categorical_feature_names) > 0:
            categorical_feature_name_regex = or_regex_group(categorical_feature_names)
        else:
            categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def _predict_class_probabilities(self, x: pd.DataFrame):
        if len(self._labels) == 1:
            # Special handling is required because LGBMClassifier returns probabilities for two classes
            # even if only a single class is present
            y = self.model.predict_proba(self._transform_input(x))
            y = y[:, 0]
            return pd.DataFrame(y, columns=self._labels)
        else:
            return super()._predict_class_probabilities(x)

    def is_sample_weight_supported(self) -> bool:
        return True
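

# Illustrative sketch (not part of the original module): a hypothetical classifier
# configuration; use_balanced_class_weights derives per-class weights from the training
# data to counteract class imbalance during fitting.
def _example_classification_model():
    return LightGBMVectorClassificationModel(categorical_feature_names=r"cat_.*",
        use_balanced_class_weights=True)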