Coverage for src/sensai/lightgbm.py: 0%

47 statements  

« prev     ^ index     » next       coverage.py v7.6.1, created at 2024-08-13 22:17 +0000

1import logging 

2import re 

3from typing import Sequence, Union, Optional 

4 

5import lightgbm 

6import pandas as pd 

7 

8from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \ 

9 FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification 

10from .util.string import or_regex_group 

11 

12log = logging.getLogger(__name__) 

13 

14 

15# noinspection PyUnusedLocal 

16def _update_fit_args(fit_args: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categorical_feature_name_regex: Optional[str]): 

17 if categorical_feature_name_regex is not None: 

18 cols = list(inputs.columns) 

19 categorical_feature_names = [col for col in cols if re.match(categorical_feature_name_regex, col)] 

20 col_indices = [cols.index(f) for f in categorical_feature_names] 

21 args = {"categorical_feature": col_indices} 

22 log.info(f"Updating fit parameters with {args}") 

23 fit_args.update(args) 

24 else: 

25 fit_args.pop("categorical_feature", None) 

26 

27 

class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel,
        FeatureImportanceProviderSkLearnRegressionMultipleOneDim):
    """Regression model wrapping lightgbm's LGBMRegressor, fitting one model per output dimension."""
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features is preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        """
        super().__init__(lightgbm.sklearn.LGBMRegressor, random_state=random_state, num_leaves=num_leaves, importance_type=importance_type,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples,
            **model_args)

        # reduce the categorical feature specification to a single regex (or None):
        # a string is taken as a regex directly, a non-empty sequence is combined into one alternation group
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # inject/remove the categorical_feature fit argument based on the actual input columns
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

65 

66 

class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification):
    """Classification model wrapping lightgbm's LGBMClassifier."""
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", use_balanced_class_weights=False,
            **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features may be preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param use_balanced_class_weights: whether to compute class weights from the training data that is given and pass it on to the
            classifier's fit method; weighted data points may not be supported for all types of models
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier
        """
        super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples, importance_type=importance_type,
            use_balanced_class_weights=use_balanced_class_weights, **model_args)

        # reduce the categorical feature specification to a single regex (or None):
        # a string is taken as a regex directly, a non-empty sequence is combined into one alternation group
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # inject/remove the categorical_feature fit argument based on the actual input columns
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def _predict_class_probabilities(self, x: pd.DataFrame):
        if len(self._labels) == 1:
            # special handling required because LGBMClassifier will return values for two classes even if there is only one;
            # keep only the first column, which corresponds to the single known label
            y = self.model.predict_proba(self._transform_input(x))
            y = y[:, 0]
            return pd.DataFrame(y, columns=self._labels)
        else:
            return super()._predict_class_probabilities(x)