[pyspark] disable repartition_random_shuffle by default (#8283)

dmlc · Sep 29, 2022 · c91fed0
1 parent 6925b22
commit c91fed0
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 1 deletion.
diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
@@ -88,6 +88,7 @@
     "features_cols",
     "enable_sparse_data_optim",
     "qid_col",
+    "repartition_random_shuffle",
 ]
 
 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -477,7 +478,7 @@ def __init__(self):
             num_workers=1,
             use_gpu=False,
             force_repartition=False,
-            repartition_random_shuffle=True,
+            repartition_random_shuffle=False,
             feature_names=None,
             feature_types=None,
             arbitrary_params_dict={},

diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
@@ -9,6 +9,8 @@
 
 from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix
 
+from .utils import get_logger  # type: ignore
+
 
 def stack_series(series: pd.Series) -> np.ndarray:
     """Stack a series of arrays."""
@@ -246,6 +248,11 @@ def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix
         else:
             append_fn = append_m
         cache_partitions(iterator, append_fn)
+        if len(train_data) == 0:
+            get_logger("XGBoostPySpark").warning(
+                "Detected an empty partition in the training data. "
+                "Consider to enable repartition_random_shuffle"
+            )
         dtrain = make(train_data, kwargs)
     else:
         cache_partitions(iterator, append_dqm)