Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add save_raw method to XGBModel #9451

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 4 additions & 3 deletions CONTRIBUTORS.md
Expand Up @@ -2,9 +2,9 @@ Contributors of DMLC/XGBoost
============================
XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute — it is a great way to make the project better and more accessible to more users.

Project Management Committee(PMC)
Project Management Committee(PMC)
----------
The Project Management Committee(PMC) consists group of active committers that moderate the discussion, manage the project release, and proposes new committer/PMC members.
The Project Management Committee(PMC) consists of a group of active committers that moderate the discussion, manage the project release, and propose new committer/PMC members.

* [Tianqi Chen](https://github.com/tqchen), University of Washington
- Tianqi is a Ph.D. student working on large-scale machine learning. He is the creator of the project.
Expand All @@ -19,7 +19,7 @@ The Project Management Committee(PMC) consists group of active committers that m
* [Hyunsu Cho](http://hyunsu-cho.io/), NVIDIA
- Hyunsu is the maintainer of the XGBoost Python package. He also manages the Jenkins continuous integration system (https://xgboost-ci.net/). He is the initial author of the CPU 'hist' updater.
* [Rory Mitchell](https://github.com/RAMitchell), University of Waikato
- Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration.
- Rory is a Ph.D. student at University of Waikato. He is the original creator of the GPU training algorithms. He improved the CMake build system and continuous integration.
* [Hongliang Liu](https://github.com/phunterlau)


Expand Down Expand Up @@ -104,3 +104,4 @@ List of Contributors
* [Haoda Fu](https://github.com/fuhaoda)
* [Evan Kepner](https://github.com/EvanKepner)
- Evan Kepner added support for os.PathLike file paths in Python
* [Judah Rand](https://github.com/judahrand)
16 changes: 14 additions & 2 deletions python-package/xgboost/sklearn.py
@@ -1,5 +1,6 @@
# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme, too-many-lines
"""Scikit-Learn Wrapper interface for XGBoost."""
import contextlib
import copy
import json
import os
Expand Down Expand Up @@ -827,17 +828,28 @@ def _get_type(self) -> str:
)
return self._estimator_type # pylint: disable=no-member

@contextlib.contextmanager
def _set_sklearn_metadata(self) -> None:
    """Temporarily attach scikit-learn metadata to the underlying booster.

    The metadata (currently only ``_estimator_type``) is stored in the
    ``scikit_learn`` booster attribute so it is included when the model is
    serialized, and removed afterwards so the in-memory booster is left
    unchanged.
    """
    meta: Dict[str, Any] = {}
    # For validation when the model is loaded back into an sklearn wrapper.
    meta["_estimator_type"] = self._get_type()
    meta_str = json.dumps(meta)
    self.get_booster().set_attr(scikit_learn=meta_str)
    try:
        yield
    finally:
        # Always clear the attribute, even if serialization raised, so the
        # booster is not left carrying save-time-only metadata.
        self.get_booster().set_attr(scikit_learn=None)

def save_model(self, fname: Union[str, os.PathLike]) -> None:
    # Embed the scikit-learn metadata only for the duration of the save so
    # the on-disk model can later be validated as an sklearn estimator.
    booster = self.get_booster()
    with self._set_sklearn_metadata():
        booster.save_model(fname)

save_model.__doc__ = f"""{Booster.save_model.__doc__}"""

def save_raw(self, raw_format: str = "deprecated") -> bytearray:
    # Same metadata embedding as ``save_model``, but returning the
    # serialized model as an in-memory buffer instead of writing a file.
    with self._set_sklearn_metadata():
        return self.get_booster().save_raw(raw_format)

# Bug fix: the original assigned ``save_model.__doc__`` here, silently
# overwriting save_model's docstring with Booster.save_raw's.
save_raw.__doc__ = f"""{Booster.save_raw.__doc__}"""

def load_model(self, fname: ModelIn) -> None:
# pylint: disable=attribute-defined-outside-init
if not self.__sklearn_is_fitted__():
Expand Down
92 changes: 92 additions & 0 deletions tests/python/test_with_sklearn.py
Expand Up @@ -990,6 +990,98 @@ def test_save_load_model():
assert clf.best_score == score


def save_raw_load_model(raw_format):
    """Round-trip an XGBClassifier through ``save_raw``/``load_model``."""
    from sklearn.datasets import load_digits
    from sklearn.model_selection import KFold

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    kf = KFold(n_splits=2, shuffle=True, random_state=rng)
    for train_index, test_index in kf.split(X, y):
        xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index])
        buf = xgb_model.save_raw(raw_format)

        xgb_model = xgb.XGBClassifier()
        xgb_model.load_model(buf)

        assert isinstance(xgb_model.classes_, np.ndarray)
        np.testing.assert_equal(xgb_model.classes_, np.array([0, 1]))
        assert isinstance(xgb_model._Booster, xgb.Booster)

        preds = xgb_model.predict(X[test_index])
        labels = y[test_index]
        # predict() returns 0/1 class labels here, so the error rate is
        # simply the fraction of mismatches.
        err = np.mean(preds != labels)
        assert err < 0.1
        # The sklearn metadata is save-time only; it must not survive on
        # the in-memory booster after loading.
        assert xgb_model.get_booster().attr('scikit_learn') is None

        # The raw buffer must also load as a native booster and produce
        # identical margins to the sklearn wrapper.
        preds = xgb_model.predict(X[test_index], output_margin=True)
        booster = xgb.Booster(model_file=buf)
        predt_1 = booster.predict(xgb.DMatrix(X[test_index]),
                                  output_margin=True)
        assert np.allclose(preds, predt_1)

        # A classifier buffer must be rejected by the generic XGBModel.
        with pytest.raises(TypeError):
            xgb_model = xgb.XGBModel()
            xgb_model.load_model(buf)


def test_save_raw_load_model():
    """Exercise save_raw/load_model across formats and estimator kinds."""
    for raw_format in ('deprecated', 'json', 'ubj'):
        save_raw_load_model(raw_format)

    from sklearn.datasets import load_digits
    from sklearn.model_selection import train_test_split

    digits = load_digits(n_class=2)
    y = digits['target']
    X = digits['data']
    booster = xgb.train({'tree_method': 'hist',
                         'objective': 'binary:logistic'},
                        dtrain=xgb.DMatrix(X, y),
                        num_boost_round=4)
    predt_0 = booster.predict(xgb.DMatrix(X))
    buf = booster.save_raw()
    # A raw buffer produced by a native booster must be loadable by the
    # sklearn wrappers.
    cls = xgb.XGBClassifier()
    cls.load_model(buf)

    proba = cls.predict_proba(X)
    assert proba.shape[0] == X.shape[0]
    assert proba.shape[1] == 2  # binary

    predt_1 = cls.predict_proba(X)[:, 1]
    assert np.allclose(predt_0, predt_1)

    cls = xgb.XGBModel()
    cls.load_model(buf)
    predt_1 = cls.predict(X)
    assert np.allclose(predt_0, predt_1)

    # mclass
    X, y = load_digits(n_class=10, return_X_y=True)
    # small test_size to force early stop
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.01, random_state=1
    )
    clf = xgb.XGBClassifier(
        n_estimators=64, tree_method="hist", early_stopping_rounds=2
    )
    clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
    score = clf.best_score
    # Bug fix: the original called ``clf.save_model(buf)``, passing a
    # bytearray where save_model expects a file path; the intent is to
    # serialize to a raw buffer and round-trip through load_model.
    buf = clf.save_raw()

    clf = xgb.XGBClassifier()
    clf.load_model(buf)
    assert clf.classes_.size == 10
    np.testing.assert_equal(clf.classes_, np.arange(10))
    assert clf.n_classes_ == 10

    # Early stopping state must survive the raw round-trip.
    assert clf.best_iteration == 27
    assert clf.best_score == score


def test_RFECV():
from sklearn.datasets import load_breast_cancer, load_diabetes, load_iris
from sklearn.feature_selection import RFECV
Expand Down