FIX HistGradientBoosting raising ValueError with monotonic_cst and categorical_feature #28925

Merged (7 commits) on May 3, 2024
5 changes: 5 additions & 0 deletions doc/whats_new/v1.5.rst
@@ -285,6 +285,11 @@ Changelog
pre-sorting the data before finding the thresholds for binning.
:pr:`28102` by :user:`Christian Lorentzen <lorentzenchr>`.

- |Fix| Fixes a bug in :class:`ensemble.HistGradientBoostingClassifier` and
:class:`ensemble.HistGradientBoostingRegressor` when `monotonic_cst` is specified
for non-categorical features.
:pr:`28925` by :user:`Xiao Yuan <yuanx749>`.

:mod:`sklearn.feature_extraction`
.................................

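For context, a minimal sketch of the configuration this changelog entry is about: combining `categorical_features` with a `monotonic_cst` on a non-categorical feature, which used to raise a ValueError before this fix (issue #28898). The synthetic data below is made up purely for illustration.

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingRegressor

    rng = np.random.RandomState(0)
    n_samples = 100
    X = np.c_[
        rng.randint(0, 10, size=n_samples),  # categorical feature at index 0
        rng.rand(n_samples),                 # numerical feature at index 1
    ]
    y = X[:, 1] + rng.normal(scale=0.1, size=n_samples)

    # Positive monotonic constraint on the numerical feature only;
    # the categorical feature gets no constraint (0).
    gbdt = HistGradientBoostingRegressor(
        categorical_features=[0], monotonic_cst=[0, +1]
    )
    gbdt.fit(X, y)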
13 changes: 12 additions & 1 deletion sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py
@@ -583,6 +583,17 @@ def fit(self, X, y, sample_weight=None):

self._validate_parameters()
monotonic_cst = _check_monotonic_cst(self, self.monotonic_cst)
# _preprocess_X places the categorical features at the beginning,
# change the order of monotonic_cst accordingly
if self.is_categorical_ is not None:
monotonic_cst_remapped = np.concatenate(
(
monotonic_cst[self.is_categorical_],
monotonic_cst[~self.is_categorical_],
)
)
else:
monotonic_cst_remapped = monotonic_cst

# used for validation in predict
n_samples, self._n_features = X.shape
@@ -895,7 +906,7 @@ def fit(self, X, y, sample_weight=None):
n_bins_non_missing=self._bin_mapper.n_bins_non_missing_,
has_missing_values=has_missing_values,
is_categorical=self._is_categorical_remapped,
monotonic_cst=monotonic_cst,
monotonic_cst=monotonic_cst_remapped,
interaction_cst=interaction_cst,
max_leaf_nodes=self.max_leaf_nodes,
max_depth=self.max_depth,
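As a standalone illustration (not the library code itself), the block added above simply reorders the constraint array with boolean masks so that entries for categorical features come first, matching the column order produced by `_preprocess_X`. The feature layout below is hypothetical.

    import numpy as np

    # Hypothetical layout: 5 features, of which indices 0, 2 and 4 are categorical.
    is_categorical = np.array([True, False, True, False, True])
    monotonic_cst = np.array([0, +1, 0, -1, 0])

    # Categorical entries first, then the non-categorical ones, mirroring the
    # concatenation performed in fit().
    monotonic_cst_remapped = np.concatenate(
        (monotonic_cst[is_categorical], monotonic_cst[~is_categorical])
    )
    print(monotonic_cst_remapped)  # [ 0  0  0  1 -1]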
sklearn/ensemble/_hist_gradient_boosting/tests/test_monotonic_constraints.py
@@ -206,19 +206,26 @@ def test_nodes_values(monotonic_cst, seed):

@pytest.mark.parametrize("use_feature_names", (True, False))
def test_predictions(global_random_seed, use_feature_names):
# Train a model with a POS constraint on the first feature and a NEG
# constraint on the second feature, and make sure the constraints are
# respected by checking the predictions.
# Train a model with a POS constraint on the first non-categorical feature
# and a NEG constraint on the second non-categorical feature, and make sure
# the constraints are respected by checking the predictions.
# test adapted from lightgbm's test_monotone_constraint(), itself inspired
# by https://xgboost.readthedocs.io/en/latest/tutorials/monotonic.html

rng = np.random.RandomState(global_random_seed)

n_samples = 1000
f_0 = rng.rand(n_samples) # positive correlation with y
f_1 = rng.rand(n_samples) # negative correslation with y
X = np.c_[f_0, f_1]
columns_name = ["f_0", "f_1"]
f_1 = rng.rand(n_samples) # negative correlation with y

# extra categorical features, no correlation with y,
# to check the correctness of monotonicity constraint remapping, see issue #28898
f_a = rng.randint(low=0, high=9, size=n_samples)
f_b = rng.randint(low=0, high=9, size=n_samples)
f_c = rng.randint(low=0, high=9, size=n_samples)

X = np.c_[f_a, f_0, f_b, f_1, f_c]
columns_name = ["f_a", "f_0", "f_b", "f_1", "f_c"]
constructor_name = "dataframe" if use_feature_names else "array"
X = _convert_container(X, constructor_name, columns_name=columns_name)

@@ -227,10 +234,14 @@ def test_predictions(global_random_seed, use_feature_names):

if use_feature_names:
monotonic_cst = {"f_0": +1, "f_1": -1}
categorical_features = ["f_a", "f_b", "f_c"]
else:
monotonic_cst = [+1, -1]
monotonic_cst = [0, +1, 0, -1, 0]
categorical_features = [0, 2, 4]

gbdt = HistGradientBoostingRegressor(monotonic_cst=monotonic_cst)
gbdt = HistGradientBoostingRegressor(
monotonic_cst=monotonic_cst, categorical_features=categorical_features
)
gbdt.fit(X, y)

linspace = np.linspace(0, 1, 100)
@@ -247,26 +258,26 @@ def test_predictions(global_random_seed, use_feature_names):
# The constraint does not guanrantee that
# x0 < x0' => f(x0, x1) < f(x0', x1')

# First feature (POS)
# First non-categorical feature (POS)
# assert pred is all increasing when f_0 is all increasing
X = np.c_[linspace, constant]
X = np.c_[constant, linspace, constant, constant, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
pred = gbdt.predict(X)
assert is_increasing(pred)
# assert pred actually follows the variations of f_0
X = np.c_[sin, constant]
X = np.c_[constant, sin, constant, constant, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
pred = gbdt.predict(X)
assert np.all((np.diff(pred) >= 0) == (np.diff(sin) >= 0))

# Second feature (NEG)
# Second non-categorical feature (NEG)
# assert pred is all decreasing when f_1 is all increasing
X = np.c_[constant, linspace]
X = np.c_[constant, constant, constant, linspace, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
pred = gbdt.predict(X)
assert is_decreasing(pred)
# assert pred actually follows the inverse variations of f_1
X = np.c_[constant, sin]
X = np.c_[constant, constant, constant, sin, constant]
X = _convert_container(X, constructor_name, columns_name=columns_name)
pred = gbdt.predict(X)
assert ((np.diff(pred) <= 0) == (np.diff(sin) >= 0)).all()
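For reference, a hedged sketch of the `is_increasing` and `is_decreasing` helpers the assertions above rely on (presumably defined elsewhere in the test module); the non-strict `np.diff` checks below are an assumption about their behavior, not a copy of the actual implementation.

    import numpy as np

    def is_increasing(a):
        # Every successive value is >= the previous one (non-strict).
        return (np.diff(a) >= 0.0).all()

    def is_decreasing(a):
        # Every successive value is <= the previous one (non-strict).
        return (np.diff(a) <= 0.0).all()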