Skip to content

Commit

Permalink
init
Browse files Browse the repository at this point in the history
Signed-off-by: Weichen Xu <weichen.xu@databricks.com>
  • Loading branch information
WeichenXu123 committed Nov 16, 2022
1 parent 812d577 commit bce3da7
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 3 deletions.
5 changes: 5 additions & 0 deletions python-package/xgboost/compat.py
Expand Up @@ -43,6 +43,11 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
pandas_concat = None
PANDAS_INSTALLED = False


# cuDF
# Availability is probed through the module spec rather than by importing
# cudf, so evaluating this flag does not itself load the package.
CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None


# sklearn
try:
from sklearn.base import BaseEstimator as XGBModelBase
Expand Down
10 changes: 9 additions & 1 deletion python-package/xgboost/spark/core.py
Expand Up @@ -37,6 +37,7 @@

import xgboost
from xgboost import XGBClassifier, XGBRanker, XGBRegressor
from xgboost.compat import CUDF_INSTALLED

from .data import (
_read_csr_matrix_from_unwrapped_spark_vec,
Expand All @@ -56,6 +57,7 @@
HasEnableSparseDataOptim,
HasFeaturesCols,
HasQueryIdCol,
UseQuantileDMatrix,
)
from .utils import (
CommunicatorContext,
Expand Down Expand Up @@ -150,6 +152,7 @@ class _SparkXGBParams(
HasFeaturesCols,
HasEnableSparseDataOptim,
HasQueryIdCol,
UseQuantileDMatrix,
):
num_workers = Param(
Params._dummy(),
Expand Down Expand Up @@ -755,7 +758,12 @@ def _fit(self, dataset):
k: v for k, v in train_call_kwargs_params.items() if v is not None
}
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

if self.isDefined(self.use_quantile_dmatrix):
use_qdm = self.getOrDefault(self.use_quantile_dmatrix)
else:
use_qdm = CUDF_INSTALLED and \
booster_params.get("tree_method", None) in ("hist", "gpu_hist")

def _train_booster(pandas_df_iter):
"""Takes in an RDD partition and outputs a booster for that partition after
Expand Down
4 changes: 2 additions & 2 deletions python-package/xgboost/spark/data.py
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.compat import concat
from xgboost.compat import CUDF_INSTALLED, concat

from xgboost import DataIter, DMatrix, QuantileDMatrix

Expand Down Expand Up @@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram
if not data:
return None

if self._device_id is not None:
if self._device_id is not None and CUDF_INSTALLED:
import cudf # pylint: disable=import-error
import cupy as cp # pylint: disable=import-error

Expand Down
16 changes: 16 additions & 0 deletions python-package/xgboost/spark/params.py
Expand Up @@ -85,3 +85,19 @@ class HasQueryIdCol(Params):
"query id column name",
typeConverter=TypeConverters.toString,
)


class UseQuantileDMatrix(Params):
    """
    Mixin that declares the boolean ``use_quantile_dmatrix`` param.

    The param selects whether the xgboost training input is constructed as a
    quantile DMatrix instead of a plain DMatrix. The behavior when the param
    is left unset is spelled out in the param's own description string below.
    """

    # Declared against a dummy parent, like the other Param declarations in
    # this package; the type converter is toBoolean, so values are expected
    # to be booleans.
    use_quantile_dmatrix = Param(
        Params._dummy(),
        "use_quantile_dmatrix",
        "This stores the boolean config of constructing quantile DMatrix instead of plain "
        "DMatrix as xgboost training input. By default, if 'tree_method' param is 'hist' or "
        "'gpu_hist' and 'cuDF' package is installed, the config is ON, otherwise the config "
        "is OFF. Note that if you do not install 'cuDF' package, turning on this config "
        "might result in performance degradation.",
        typeConverter=TypeConverters.toBoolean,
    )

0 comments on commit bce3da7

Please sign in to comment.