diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 29d21e6a8c3b..03febc8eb1c2 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -2,9 +2,9 @@ Contributors of DMLC/XGBoost ============================ XGBoost has been developed and used by a group of active community. Everyone is more than welcomed to is a great way to make the project better and more accessible to more users. -Project Management Committee(PMC) +Project Management Committee(PMC) ---------- -The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. +The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members. * [Tianqi Chen](https://github.com/tqchen), University of Washington - Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project. @@ -19,7 +19,7 @@ The Project Management Committee(PMC) consists group of active committers that m * [Hyunsu Cho](http://hyunsu-cho.io/), NVIDIA - Hyunsu is the maintainer of the XGBoost Python package. He also manages the Jenkins continuous integration system (https://xgboost-ci.net/). He is the initial author of the CPU 'hist' updater. * [Rory Mitchell](https://github.com/RAMitchell), University of Waikato - - Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration. + - Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration. 
* [Hongliang Liu](https://github.com/phunterlau)
@@ -104,3 +104,4 @@ List of Contributors
 * [Haoda Fu](https://github.com/fuhaoda)
 * [Evan Kepner](https://github.com/EvanKepner)
   - Evan Kepner added support for os.PathLike file paths in Python
+* [Judah Rand](https://github.com/judahrand)
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index e791be51c0e4..33896f625ef5 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -1,5 +1,6 @@
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-lines
 """Scikit-Learn Wrapper interface for XGBoost."""
+import contextlib
 import copy
 import json
 import os
@@ -827,17 +828,37 @@ def _get_type(self) -> str:
             )
         return self._estimator_type  # pylint: disable=no-member
 
-    def save_model(self, fname: Union[str, os.PathLike]) -> None:
+    @contextlib.contextmanager
+    def _set_sklearn_metadata(self) -> None:
+        """Temporarily attach scikit-learn metadata to the underlying booster.
+
+        The JSON-encoded estimator type is stored in the ``scikit_learn``
+        booster attribute while the ``with`` body runs and is removed again
+        afterwards, including when the body raises.
+        """
         meta: Dict[str, Any] = {}
         # For validation.
         meta["_estimator_type"] = self._get_type()
         meta_str = json.dumps(meta)
         self.get_booster().set_attr(scikit_learn=meta_str)
-        self.get_booster().save_model(fname)
-        self.get_booster().set_attr(scikit_learn=None)
+        try:
+            yield
+        finally:
+            # Always clear the attribute so a failed save does not leave it set.
+            self.get_booster().set_attr(scikit_learn=None)
 
+    def save_model(self, fname: Union[str, os.PathLike]) -> None:
+        with self._set_sklearn_metadata():
+            self.get_booster().save_model(fname)
+
     save_model.__doc__ = f"""{Booster.save_model.__doc__}"""
 
+    def save_raw(self, raw_format: str = "deprecated") -> bytearray:
+        with self._set_sklearn_metadata():
+            return self.get_booster().save_raw(raw_format)
+
+    save_raw.__doc__ = f"""{Booster.save_raw.__doc__}"""
+
     def load_model(self, fname: ModelIn) -> None:
         # pylint: disable=attribute-defined-outside-init
         if not self.__sklearn_is_fitted__():
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 9a58b7277adf..636f55073a8e 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -990,6 +990,98 @@ def
test_save_load_model():
     assert clf.best_score == score
 
 
+def save_raw_load_model(raw_format):
+    from sklearn.datasets import load_digits
+    from sklearn.model_selection import KFold
+
+    digits = load_digits(n_class=2)
+    y = digits['target']
+    X = digits['data']
+    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
+    for train_index, test_index in kf.split(X, y):
+        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
+        buf = xgb_model.save_raw(raw_format)
+
+        xgb_model = xgb.XGBClassifier()
+        xgb_model.load_model(buf)
+
+        assert isinstance(xgb_model.classes_, np.ndarray)
+        np.testing.assert_equal(xgb_model.classes_, np.array([0, 1]))
+        assert isinstance(xgb_model._Booster, xgb.Booster)
+
+        preds = xgb_model.predict(X[test_index])
+        labels = y[test_index]
+        # fraction of held-out samples whose thresholded prediction is wrong
+        err = np.mean((preds > 0.5).astype(int) != labels)
+        assert err < 0.1
+        assert xgb_model.get_booster().attr('scikit_learn') is None
+
+        # test native booster
+        preds = xgb_model.predict(X[test_index], output_margin=True)
+        booster = xgb.Booster(model_file=buf)
+        predt_1 = booster.predict(xgb.DMatrix(X[test_index]),
+                                  output_margin=True)
+        assert np.allclose(preds, predt_1)
+
+        with pytest.raises(TypeError):
+            xgb_model = xgb.XGBModel()
+            xgb_model.load_model(buf)
+
+
+def test_save_raw_load_model():
+    for raw_format in ('deprecated', 'json', 'ubj'):
+        save_raw_load_model(raw_format)
+
+    from sklearn.datasets import load_digits
+    from sklearn.model_selection import train_test_split
+
+    digits = load_digits(n_class=2)
+    y = digits['target']
+    X = digits['data']
+    booster = xgb.train({'tree_method': 'hist',
+                         'objective': 'binary:logistic'},
+                        dtrain=xgb.DMatrix(X, y),
+                        num_boost_round=4)
+    predt_0 = booster.predict(xgb.DMatrix(X))
+    buf = booster.save_raw()
+    cls = xgb.XGBClassifier()
+    cls.load_model(buf)
+
+    proba = cls.predict_proba(X)
+    assert proba.shape[0] == X.shape[0]
+    assert proba.shape[1] == 2  # binary
+
+    predt_1 = cls.predict_proba(X)[:, 1]
+    assert np.allclose(predt_0, predt_1)
+
+    cls = xgb.XGBModel()
+    cls.load_model(buf)
+    predt_1 = cls.predict(X)
+    assert np.allclose(predt_0, predt_1)
+
+    # mclass
+    X, y = load_digits(n_class=10, return_X_y=True)
+    # small test_size to force early stop
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.01, random_state=1
+    )
+    clf = xgb.XGBClassifier(
+        n_estimators=64, tree_method="hist", early_stopping_rounds=2
+    )
+    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
+    score = clf.best_score
+    buf = clf.save_raw()  # serialize to an in-memory buffer; buf is not a file path
+
+    clf = xgb.XGBClassifier()
+    clf.load_model(buf)
+    assert clf.classes_.size == 10
+    np.testing.assert_equal(clf.classes_, np.arange(10))
+    assert clf.n_classes_ == 10
+
+    assert clf.best_iteration == 27
+    assert clf.best_score == score
+
+
 def test_RFECV():
     from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
     from sklearn.feature_selection import RFECV