Coverage for src/sensai/lightgbm.py: 0%
47 statements
« prev ^ index » next coverage.py v7.6.1, created at 2024-08-13 22:17 +0000
1import logging
2import re
3from typing import Sequence, Union, Optional
5import lightgbm
6import pandas as pd
8from .sklearn.sklearn_base import AbstractSkLearnMultipleOneDimVectorRegressionModel, AbstractSkLearnVectorClassificationModel, \
9 FeatureImportanceProviderSkLearnRegressionMultipleOneDim, FeatureImportanceProviderSkLearnClassification
10from .util.string import or_regex_group
12log = logging.getLogger(__name__)
15# noinspection PyUnusedLocal
16def _update_fit_args(fit_args: dict, inputs: pd.DataFrame, outputs: pd.DataFrame, categorical_feature_name_regex: Optional[str]):
17 if categorical_feature_name_regex is not None:
18 cols = list(inputs.columns)
19 categorical_feature_names = [col for col in cols if re.match(categorical_feature_name_regex, col)]
20 col_indices = [cols.index(f) for f in categorical_feature_names]
21 args = {"categorical_feature": col_indices}
22 log.info(f"Updating fit parameters with {args}")
23 fit_args.update(args)
24 else:
25 fit_args.pop("categorical_feature", None)
class LightGBMVectorRegressionModel(AbstractSkLearnMultipleOneDimVectorRegressionModel,
        FeatureImportanceProviderSkLearnRegressionMultipleOneDim):
    """
    Vector regression model based on lightgbm's LGBMRegressor (one one-dimensional regressor per output
    dimension), with optional support for explicitly declared categorical features.
    """
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features is preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If ‘split’, result contains numbers of times the feature is used in a model.
            If ‘gain’, result contains total gains of splits which use the feature.
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMRegressor.html
        """
        super().__init__(lightgbm.sklearn.LGBMRegressor, random_state=random_state, num_leaves=num_leaves, importance_type=importance_type,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples,
            **model_args)
        # a single string is interpreted as a regex; a sequence of names is combined into one regex group
        # (fix: use isinstance rather than an exact type comparison, so str subclasses are handled too)
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        # may legitimately be None (no categorical features declared), hence Optional
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # delegate to the module-level helper, which updates self.fitArgs in place
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)
class LightGBMVectorClassificationModel(AbstractSkLearnVectorClassificationModel, FeatureImportanceProviderSkLearnClassification):
    """
    Vector classification model based on lightgbm's LGBMClassifier, with optional support for explicitly
    declared categorical features and balanced class weights.
    """
    log = log.getChild(__qualname__)

    def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, random_state=42, num_leaves=31,
            max_depth=-1, n_estimators=100, min_child_samples=20, importance_type="gain", use_balanced_class_weights=False,
            **model_args):
        """
        :param categorical_feature_names: sequence of feature names in the input data that are categorical or a single string containing
            a regex matching the categorical feature names.
            Columns that have dtype 'category' (as will be the case for categorical columns created via FeatureGenerators)
            need not be specified (will be inferred automatically).
            In general, passing categorical features may be preferable to using one-hot encoding, for example.
        :param random_state: the random seed to use
        :param num_leaves: the maximum number of leaves in one tree (original lightgbm default is 31)
        :param max_depth: maximum tree depth for base learners, <=0 means no limit
        :param n_estimators: number of boosted trees to fit
        :param min_child_samples: minimum number of data needed in a child (leaf)
        :param importance_type: the type of feature importance to be set in the respective property of the wrapped model.
            If ‘split’, result contains numbers of times the feature is used in a model.
            If ‘gain’, result contains total gains of splits which use the feature.
        :param use_balanced_class_weights: whether to compute class weights from the training data that is given and pass it on to the
            classifier's fit method; weighted data points may not be supported for all types of models
        :param model_args: see https://lightgbm.readthedocs.io/en/latest/pythonapi/lightgbm.LGBMClassifier.html?highlight=LGBMClassifier
        """
        super().__init__(lightgbm.sklearn.LGBMClassifier, random_state=random_state, num_leaves=num_leaves,
            max_depth=max_depth, n_estimators=n_estimators, min_child_samples=min_child_samples, importance_type=importance_type,
            use_balanced_class_weights=use_balanced_class_weights, **model_args)
        # a single string is interpreted as a regex; a sequence of names is combined into one regex group
        # (fix: use isinstance rather than an exact type comparison, so str subclasses are handled too)
        if isinstance(categorical_feature_names, str):
            categorical_feature_name_regex = categorical_feature_names
        else:
            if categorical_feature_names is not None and len(categorical_feature_names) > 0:
                categorical_feature_name_regex = or_regex_group(categorical_feature_names)
            else:
                categorical_feature_name_regex = None
        # may legitimately be None (no categorical features declared), hence Optional
        self._categoricalFeatureNameRegex: Optional[str] = categorical_feature_name_regex

    def _update_fit_args(self, inputs: pd.DataFrame, outputs: pd.DataFrame):
        # delegate to the module-level helper, which updates self.fitArgs in place
        _update_fit_args(self.fitArgs, inputs, outputs, self._categoricalFeatureNameRegex)

    def _predict_class_probabilities(self, x: pd.DataFrame):
        if len(self._labels) == 1:
            # special handling required because LGBMClassifier will return values for two classes even if there is only one
            y = self.model.predict_proba(self._transform_input(x))
            # keep only the column for the single existing class
            y = y[:, 0]
            return pd.DataFrame(y, columns=self._labels)
        else:
            return super()._predict_class_probabilities(x)