Skip to content

Commit 5a498d9

Browse files
committed
Applies the changes to the LGBM regressor.
Also adds a unit test.
1 parent 03e84e0 commit 5a498d9

File tree

2 files changed

+41
-5
lines changed

2 files changed

+41
-5
lines changed

src/fklearn/training/regression.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,7 @@ def lgbm_regression_learner(df: pd.DataFrame,
412412
learning_rate: float = 0.1,
413413
num_estimators: int = 100,
414414
extra_params: Dict[str, Any] = None,
415+
categorical_features: Union[List[str], str] = "auto",
415416
prediction_column: str = "prediction",
416417
weight_column: str = None,
417418
encode_extra_cols: bool = True) -> LearnerReturnType:
@@ -458,6 +459,11 @@ def lgbm_regression_learner(df: pd.DataFrame,
458459
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
459460
If not passed, the default will be used.
460461
462+
categorical_features : list of str, or 'auto', optional (default="auto")
463+
A list of column names that should be treated as categorical features.
464+
See the categorical_feature hyper-parameter in:
465+
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
466+
461467
prediction_column : str
462468
The name of the column with the predictions from the model.
463469
@@ -474,17 +480,17 @@ def lgbm_regression_learner(df: pd.DataFrame,
474480
params = assoc(params, "eta", learning_rate)
475481
params = params if "objective" in params else assoc(params, "objective", 'regression')
476482

477-
weights = df[weight_column].values if weight_column else None
483+
weights = df[weight_column] if weight_column else None
478484

479485
features = features if not encode_extra_cols else expand_features_encoded(df, features)
480486

481-
dtrain = lgbm.Dataset(df[features].values, label=df[target], feature_name=list(map(str, features)), weight=weights,
482-
silent=True)
487+
dtrain = lgbm.Dataset(df[features], label=df[target], feature_name=list(map(str, features)), weight=weights,
488+
silent=True, categorical_feature=categorical_features)
483489

484-
bst = lgbm.train(params, dtrain, num_estimators)
490+
bst = lgbm.train(params, dtrain, num_estimators, categorical_feature=categorical_features)
485491

486492
def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
487-
col_dict = {prediction_column: bst.predict(new_df[features].values)}
493+
col_dict = {prediction_column: bst.predict(new_df[features])}
488494

489495
if apply_shap:
490496
import shap

tests/training/test_regression.py

+30
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ def test_lgbm_regression_learner():
169169
assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
170170
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
171171
assert (pred_test.columns == pred_train.columns).all()
172+
assert all(tree['num_cat'] == 0 for tree in log['object'].dump_model()['tree_info'])
172173
assert "prediction" in pred_test.columns
173174

174175
# SHAP test
@@ -177,6 +178,35 @@ def test_lgbm_regression_learner():
177178
assert "shap_expected_value" in pred_shap.columns
178179
assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
179180

181+
learner = lgbm_regression_learner(features=features,
182+
target="y",
183+
learning_rate=0.1,
184+
num_estimators=1,
185+
categorical_features=["x2"],
186+
extra_params={"max_depth": 2,
187+
"min_data_in_leaf": 1,
188+
"min_data_per_group": 1,
189+
"seed": 42},
190+
prediction_column="prediction")
191+
192+
predict_fn, pred_train, log = learner(df_train)
193+
194+
pred_test = predict_fn(df_test)
195+
196+
expected_col_train = df_train.columns.tolist() + ["prediction"]
197+
expected_col_test = df_test.columns.tolist() + ["prediction"]
198+
199+
assert Counter(expected_col_train) == Counter(pred_train.columns.tolist())
200+
assert Counter(expected_col_test) == Counter(pred_test.columns.tolist())
201+
assert (pred_test.columns == pred_train.columns).all()
202+
assert any(tree['num_cat'] > 0 for tree in log['object'].dump_model()['tree_info'])
203+
204+
# SHAP test
205+
pred_shap = predict_fn(df_test, apply_shap=True)
206+
assert "shap_values" in pred_shap.columns
207+
assert "shap_expected_value" in pred_shap.columns
208+
assert np.vstack(pred_shap["shap_values"]).shape == (4, 2)
209+
180210

181211
def test_catboost_regressor_learner():
182212
df_train = pd.DataFrame({

0 commit comments

Comments
 (0)