Coverage for src/sensai/lightgbm.py: 0%
51 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-11-29 18:29 +0000
1import logging
2import re
3from typing import Sequence, Union, Optional
5import lightgbm
6import pandas as pd
8from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \
9 FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification
10from .util.string import or_regex_group
# Module-level logger; the model classes below derive per-class child loggers from it.
log = logging.getLogger(__name__)
15# noinspection PyUnusedLocal
16def _update_fit_args(fit_args: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categorical_feature_name_regex: Optional[str]):
17 if categorical_feature_name_regex is not None:
18 cols = list(inputs.columns)
19 categorical_feature_names = [col for col in cols if re.match(categorical_feature_name_regex, col)]
20 col_indices = [cols.index(f) for f in categorical_feature_names]
21 args = {"categorical_feature": col_indices}
22 log.info(f"Updating fit parameters with {args}")
23 fit_args.update(args)
24 else:
25 fit_args.pop("categorical_feature", None)
class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel,
        FeatureImportanceProviderSkLearnRegressionMultipleOneDim):
    """
    Vector regression model based on lightgbm's LGBMRegressor, fitting one model per output dimension.
    """
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features is preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        """
        super().__init__(lightgbm.sklearn.LGBMRegressor, random_state=random_state, num_leaves=num_leaves, importance_type=importance_type,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples,
            **model_args)

        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        elif categorical_feature_names:  # non-empty sequence of feature names
            categorical_feature_name_regex = or_regex_group(categorical_feature_names)
        else:  # None or empty sequence
            categorical_feature_name_regex = None
        # may legitimately be None, hence Optional[str] (the original `str` annotation was too narrow)
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # delegates to the module-level helper, which sets/clears the "categorical_feature" fit argument
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def is_sample_weight_supported(self) -> bool:
        return True
class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification):
    """
    Classification model based on lightgbm's LGBMClassifier.
    """
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", use_balanced_class_weights=False,
            **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features may be preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If 'split', result contains numbers of times the feature is used in a model.
            If 'gain', result contains total gains of splits which use the feature.
        :param use_balanced_class_weights: whether to compute class weights from the training data that is given and pass it on to the
            classifier's fit method; weighted data points may not be supported for all types of models
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier
        """
        super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples, importance_type=importance_type,
            use_balanced_class_weights=use_balanced_class_weights, **model_args)

        # isinstance (rather than an exact type comparison) also accepts str subclasses
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        elif categorical_feature_names:  # non-empty sequence of feature names
            categorical_feature_name_regex = or_regex_group(categorical_feature_names)
        else:  # None or empty sequence
            categorical_feature_name_regex = None
        # may legitimately be None, hence Optional[str] (the original `str` annotation was too narrow)
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # delegates to the module-level helper, which sets/clears the "categorical_feature" fit argument
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def _predict_class_probabilities(self, x: pd.DataFrame):
        if len(self._labels) == 1:
            # special handling required because LGBMClassifier will return values for two classes even if there is only one
            y = self.model.predict_proba(self._transform_input(x))
            y = y[:, 0]
            return pd.DataFrame(y, columns=self._labels)
        else:
            return super()._predict_class_probabilities(x)

    def is_sample_weight_supported(self) -> bool:
        return True