[jvm-packages] [pyspark] Make QDM optional based on cuDF check #8471

Merged
merged 6 commits on Nov 27, 2022
16 changes: 12 additions & 4 deletions python-package/xgboost/compat.py
@@ -44,10 +44,6 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
PANDAS_INSTALLED = False


# cuDF
CUDF_INSTALLED = importlib.util.find_spec("cudf") is not None


# sklearn
try:
from sklearn.base import BaseEstimator as XGBModelBase
@@ -77,6 +73,18 @@ def lazy_isinstance(instance: Any, module: str, name: str) -> bool:
XGBStratifiedKFold = None


def is_cudf_installed() -> bool:
    """Check whether cuDF is installed."""
    # Check by importing cudf rather than `importlib.util.find_spec("cudf") is not None`,
    # because a user might install cuDF successfully and yet importing it still fails
    # (e.g. when running on a mismatched CUDA version).
    try:
        import cudf  # imported only to verify that it can actually be loaded

        return True
    except ImportError:
        return False


class XGBoostLabelEncoder(LabelEncoder):
"""Label encoder with JSON serialization methods."""

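The point of the import-based check above is that `importlib.util.find_spec("cudf")` only confirms that a `cudf` package is discoverable on `sys.path`; it cannot tell whether the module will actually import, which can fail even after a successful install (for example, on a mismatched CUDA version). A minimal standalone sketch of the two probes, using a hypothetical `probe_cudf` helper that is not part of this PR:

```python
import importlib.util


def probe_cudf() -> str:
    """Hypothetical helper contrasting the two ways of detecting cuDF."""
    # `find_spec` only proves the package is discoverable on sys.path.
    if importlib.util.find_spec("cudf") is None:
        return "not installed"
    # An actual import also exercises cuDF's native/CUDA bindings and can
    # raise ImportError even though the package installed successfully.
    try:
        import cudf  # noqa: F401

        return "installed and importable"
    except ImportError:
        return "installed but not importable (e.g. CUDA version mismatch)"


print(probe_cudf())
```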
14 changes: 8 additions & 6 deletions python-package/xgboost/spark/core.py
@@ -37,7 +37,7 @@

import xgboost
from xgboost import XGBClassifier, XGBRanker, XGBRegressor
from xgboost.compat import CUDF_INSTALLED
from xgboost.compat import is_cudf_installed

from .data import (
_read_csr_matrix_from_unwrapped_spark_vec,
@@ -57,7 +57,6 @@
HasEnableSparseDataOptim,
HasFeaturesCols,
HasQueryIdCol,
UseQuantileDMatrix,
)
from .utils import (
CommunicatorContext,
@@ -758,10 +757,7 @@ def _fit(self, dataset):
}
dmatrix_kwargs = {k: v for k, v in dmatrix_kwargs.items() if v is not None}

# If cuDF is not installed, then using DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
use_qdm = CUDF_INSTALLED and \
booster_params.get("tree_method", None) in ("hist", "gpu_hist")
use_hist = booster_params.get("tree_method", None) in ("hist", "gpu_hist")

def _train_booster(pandas_df_iter):
"""Takes in an RDD partition and outputs a booster for that partition after
@@ -775,6 +771,12 @@ def _train_booster(pandas_df_iter):

gpu_id = None

# If cuDF is not installed, use DMatrix instead of QDM,
# because without cuDF, DMatrix performs better than QDM.
# Note: `is_cudf_installed` is checked on the Spark worker side because
# a Spark worker might have a different Python environment than the driver.
use_qdm = use_hist and is_cudf_installed()

if use_qdm and (booster_params.get("max_bin", None) is not None):
dmatrix_kwargs["max_bin"] = booster_params["max_bin"]

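Downstream of the `use_qdm` flag, the worker builds either a `QuantileDMatrix` (which must be given the same `max_bin` the booster trains with, hence the `dmatrix_kwargs["max_bin"]` handling above) or a plain `DMatrix`. A rough sketch of that choice on a single in-memory partition, assuming XGBoost >= 1.7 and NumPy arrays rather than the iterator-based construction the real `xgboost.spark` code path uses:

```python
import numpy as np
import xgboost as xgb


def build_train_matrix(
    X: np.ndarray, y: np.ndarray, use_qdm: bool, max_bin: int = 256
) -> xgb.DMatrix:
    """Illustrative only: pick QDM vs. DMatrix the way the worker decides."""
    if use_qdm:
        # QuantileDMatrix pre-bins the features, so it needs the booster's max_bin.
        return xgb.QuantileDMatrix(X, label=y, max_bin=max_bin)
    # Without cuDF (or with a non-hist tree method), fall back to a plain DMatrix.
    return xgb.DMatrix(X, label=y)


X = np.random.rand(256, 8)
y = np.random.randint(0, 2, size=256)
dtrain = build_train_matrix(X, y, use_qdm=False)
booster = xgb.train(
    {"tree_method": "hist", "max_bin": 256, "objective": "binary:logistic"},
    dtrain,
    num_boost_round=5,
)
```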