Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[jvm-packages] [pyspark] Make QDM optional based on cuDF check #8471

Merged
merged 6 commits into from Nov 27, 2022
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
13 changes: 13 additions & 0 deletions python-package/xgboost/compat.py
Expand Up @@ -43,6 +43,7 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
pandas_concat = None
PANDAS_INSTALLED = False


# sklearn
try:
from sklearn.base import BaseEstimator as XGBModelBase
Expand Down Expand Up @@ -72,6 +73,18 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
XGBStratifiedKFold = None


def is_cudf_installed() -> bool:
    """Check whether cuDF is installed and importable.

    Returns
    -------
    bool
        True if ``import cudf`` succeeds, False otherwise.
    """
    # Check by importing instead of `importlib.util.find_spec("cudf") is not None`
    # because the user might have installed cudf successfully while importing it
    # still raises an error (e.g. complaining about a mismatched CUDA version).
    try:
        import cudf  # pylint: disable=import-error,unused-import

        return True
    except Exception:  # pylint: disable=broad-except
        # Catch more than ImportError: a broken cuDF installation can raise
        # e.g. RuntimeError at import time (mismatched CUDA version), and the
        # caller should fall back to non-QDM behavior in that case too.
        return False


class XGBoostLabelEncoder(LabelEncoder):
"""Label encoder with JSON serialization methods."""

Expand Down
10 changes: 9 additions & 1 deletion python-package/xgboost/spark/core.py
Expand Up @@ -37,6 +37,7 @@

import xgboost
from xgboost import XGBClassifier, XGBRanker, XGBRegressor
from xgboost.compat import is_cudf_installed

from .data import (
_read_csr_matrix_from_unwrapped_spark_vec,
Expand Down Expand Up @@ -755,7 +756,8 @@ def _fit(self, dataset):
k: v for k, v in train_call_kwargs_params.items() if v is not None
}
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}
use_qdm = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

def _train_booster(pandas_df_iter):
"""Takes in an RDD partition and outputs a booster for that partition after
Expand All @@ -769,6 +771,12 @@ def _train_booster(pandas_df_iter):

gpu_id = None

# If cuDF is not installed, then using DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
# Note: Checking `is_cudf_installed` on the Spark worker side because
# the Spark worker might have a different Python environment than the driver.
use_qdm = use_hist and is_cudf_installed()
WeichenXu123 marked this conversation as resolved.
Show resolved Hide resolved

if use_qdm and (booster_params.get("max_bin", None) is not None):
dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

Expand Down
4 changes: 2 additions & 2 deletions python-package/xgboost/spark/data.py
Expand Up @@ -5,7 +5,7 @@
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from xgboost.compat import concat
from xgboost.compat import CUDF_INSTALLED, concat
WeichenXu123 marked this conversation as resolved.
Show resolved Hide resolved

from xgboost import DataIter, DMatrix, QuantileDMatrix

Expand Down Expand Up @@ -81,7 +81,7 @@ def _fetch(self, data: Optional[Sequence[pd.DataFrame]]) -> Optional[pd.DataFram
if not data:
return None

if self._device_id is not None:
if self._device_id is not None and CUDF_INSTALLED:
import cudf # pylint: disable=import-error
import cupy as cp # pylint: disable=import-error

Expand Down