From 5f6577ade5eaa12e5cc0e9f8cadfdd9f775c32aa Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 14 Oct 2021 16:28:26 +0800 Subject: [PATCH 1/2] Initial commit. --- python-package/xgboost/core.py | 16 ++++++++-------- python-package/xgboost/data.py | 13 ++++++------- python-package/xgboost/sklearn.py | 6 +----- tests/python-gpu/test_gpu_with_sklearn.py | 16 +++++++++++++++- 4 files changed, 30 insertions(+), 21 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 5218630e7196..05b354e21930 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1973,13 +1973,6 @@ def inplace_predict( preds = ctypes.POINTER(ctypes.c_float)() # once caching is supported, we can pass id(data) as cache id. - try: - import pandas as pd - - if isinstance(data, pd.DataFrame): - data = data.values - except ImportError: - pass args = { "type": 0, "training": False, @@ -2014,7 +2007,15 @@ def inplace_predict( f"got {data.shape[1]}" ) + from .data import _is_pandas_df, _transform_pandas_df from .data import _array_interface + if _is_pandas_df(data): + ft = self.feature_types + if ft is None: + enable_categorical = False + else: + enable_categorical = any(f == "c" for f in ft) + data, _, _ = _transform_pandas_df(data, enable_categorical) if isinstance(data, np.ndarray): from .data import _ensure_np_dtype data, _ = _ensure_np_dtype(data, data.dtype) @@ -2068,7 +2069,6 @@ def inplace_predict( return _prediction_output(shape, dims, preds, True) if lazy_isinstance(data, "cudf.core.dataframe", "DataFrame"): from .data import _cudf_array_interfaces - _, interfaces_str = _cudf_array_interfaces(data) _check_call( _LIB.XGBoosterPredictFromCudaColumnar( diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index e4b5a690359a..8908a3a58b62 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -289,16 +289,15 @@ def _transform_pandas_df( def _from_pandas_df( data, enable_categorical: bool, - missing, - nthread, + missing: float, + nthread: int, feature_names: Optional[List[str]], feature_types: Optional[List[str]], -): +) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]: data, feature_names, feature_types = _transform_pandas_df( - data, enable_categorical, feature_names, feature_types) - return _from_numpy_array(data, missing, nthread, feature_names, - feature_types) - + data, enable_categorical, feature_names, feature_types + ) + return _from_numpy_array(data, missing, nthread, feature_names, feature_types) def _is_pandas_series(data): try: diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 172ea20a6ece..e4c4b9928fb2 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -809,11 +809,7 @@ def _can_use_inplace_predict(self) -> bool: # Inplace predict doesn't handle as many data types as DMatrix, but it's # sufficient for dask interface where input is simpiler. predictor = self.get_params().get("predictor", None) - if ( - not self.enable_categorical - and predictor in ("auto", None) - and self.booster != "gblinear" - ): + if predictor in ("auto", None) and self.booster != "gblinear": return True return False diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 58e64886d9d5..6664004f517b 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -59,7 +59,6 @@ def test_categorical(): ) X = pd.DataFrame(X.todense()).astype("category") clf.fit(X, y) - assert not clf._can_use_inplace_predict() with tempfile.TemporaryDirectory() as tempdir: model = os.path.join(tempdir, "categorial.json") @@ -74,3 +73,18 @@ def test_categorical(): ) assert categories_sizes.shape[0] != 0 np.testing.assert_allclose(categories_sizes, 1) + + X = pd.DataFrame({"f0": ["a", "b", "c"]}) + X["f0"] = X["f0"].astype("category") + + y = [1, 2, 3] + reg = xgb.XGBRegressor( + tree_method="gpu_hist", enable_categorical=True, n_estimators=64 + ) + reg.fit(X, y) + predts = reg.predict(X) + booster = reg.get_booster() + assert "c" in booster.feature_types + assert len(booster.feature_types) == 1 + inp_predts = booster.inplace_predict(X) + np.testing.assert_allclose(predts, inp_predts) From 5fd7dbafe1c7559e1d4d80dab7b74b5c749efb93 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 14 Oct 2021 16:42:09 +0800 Subject: [PATCH 2/2] Test. --- tests/python-gpu/test_gpu_with_sklearn.py | 32 +++++++++++++++-------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 6664004f517b..f8d510753944 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -44,9 +44,12 @@ def test_num_parallel_tree(): @pytest.mark.skipif(**tm.no_pandas()) +@pytest.mark.skipif(**tm.no_cudf()) @pytest.mark.skipif(**tm.no_sklearn()) def test_categorical(): import pandas as pd + import cudf + import cupy as cp from sklearn.datasets import load_svmlight_file data_dir = os.path.join(tm.PROJECT_ROOT, "demo", "data") @@ -74,17 +77,24 @@ def test_categorical(): assert categories_sizes.shape[0] != 0 np.testing.assert_allclose(categories_sizes, 1) + def check_predt(X, y): + reg = xgb.XGBRegressor( + tree_method="gpu_hist", enable_categorical=True, n_estimators=64 + ) + reg.fit(X, y) + predts = reg.predict(X) + booster = reg.get_booster() + assert "c" in booster.feature_types + assert len(booster.feature_types) == 1 + inp_predts = booster.inplace_predict(X) + if isinstance(inp_predts, cp.ndarray): + inp_predts = cp.asnumpy(inp_predts) + np.testing.assert_allclose(predts, inp_predts) + + y = [1, 2, 3] X = pd.DataFrame({"f0": ["a", "b", "c"]}) X["f0"] = X["f0"].astype("category") + check_predt(X, y) - y = [1, 2, 3] - reg = xgb.XGBRegressor( - tree_method="gpu_hist", enable_categorical=True, n_estimators=64 - ) - reg.fit(X, y) - predts = reg.predict(X) - booster = reg.get_booster() - assert "c" in booster.feature_types - assert len(booster.feature_types) == 1 - inp_predts = booster.inplace_predict(X) - np.testing.assert_allclose(predts, inp_predts) + X = cudf.DataFrame(X) + check_predt(X, y)