From 5b17bb00317ac43929eb795f5c9a3da2fc5c6c30 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 12 Oct 2021 14:31:12 +0800 Subject: [PATCH] Fix prediction with cat data in sklearn interface. (#7306) * Specify DMatrix parameter for pre-processing dataframe. * Add document about the behaviour of prediction. --- python-package/xgboost/sklearn.py | 20 ++++++++++++++++---- tests/python-gpu/test_gpu_with_sklearn.py | 1 + 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index e4b6f2f8fab4..172ea20a6ece 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -174,7 +174,9 @@ def inner(preds: np.ndarray, dmatrix: DMatrix) -> Tuple[np.ndarray, np.ndarray]: Device ordinal. validate_parameters : Optional[bool] Give warnings for unknown parameter. - + predictor : Optional[str] + Force XGBoost to use specific predictor, available choices are [cpu_predictor, + gpu_predictor]. enable_categorical : bool .. versionadded:: 1.5.0 @@ -807,7 +809,11 @@ def _can_use_inplace_predict(self) -> bool: # Inplace predict doesn't handle as many data types as DMatrix, but it's # sufficient for dask interface where input is simpiler. predictor = self.get_params().get("predictor", None) - if predictor in ("auto", None) and self.booster != "gblinear": + if ( + not self.enable_categorical + and predictor in ("auto", None) + and self.booster != "gblinear" + ): return True return False @@ -834,7 +840,9 @@ def predict( iteration_range: Optional[Tuple[int, int]] = None, ) -> np.ndarray: """Predict with `X`. If the model is trained with early stopping, then `best_iteration` - is used automatically. + is used automatically. For tree models, when data is on GPU, like cupy array or + cuDF dataframe and `predictor` is not specified, the prediction is run on GPU + automatically, otherwise it will run on CPU. .. note:: This function is only thread safe for `gbtree` and `dart`. @@ -862,6 +870,7 @@ def predict( Returns ------- prediction + """ iteration_range = _convert_ntree_limit( self.get_booster(), ntree_limit, iteration_range @@ -886,7 +895,10 @@ def predict( pass test = DMatrix( - X, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs + X, base_margin=base_margin, + missing=self.missing, + nthread=self.n_jobs, + enable_categorical=self.enable_categorical ) return self.get_booster().predict( data=test, diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 47a2f44ca1bc..58e64886d9d5 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -59,6 +59,7 @@ def test_categorical(): ) X = pd.DataFrame(X.todense()).astype("category") clf.fit(X, y) + assert not clf._can_use_inplace_predict() with tempfile.TemporaryDirectory() as tempdir: model = os.path.join(tempdir, "categorial.json")