From c91fed083d727d2d4c0e59675586d4a5eba5d104 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Thu, 29 Sep 2022 10:50:51 +0800
Subject: [PATCH] [pyspark] disable repartition_random_shuffle by default (#8283)

---
 python-package/xgboost/spark/core.py | 3 ++-
 python-package/xgboost/spark/data.py | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/core.py b/python-package/xgboost/spark/core.py
index ffeeae8a77d7..03d431dd4eea 100644
--- a/python-package/xgboost/spark/core.py
+++ b/python-package/xgboost/spark/core.py
@@ -88,6 +88,7 @@
     "features_cols",
     "enable_sparse_data_optim",
     "qid_col",
+    "repartition_random_shuffle",
 ]
 
 _non_booster_params = ["missing", "n_estimators", "feature_types", "feature_weights"]
@@ -477,7 +478,7 @@ def __init__(self):
             num_workers=1,
             use_gpu=False,
             force_repartition=False,
-            repartition_random_shuffle=True,
+            repartition_random_shuffle=False,
             feature_names=None,
             feature_types=None,
             arbitrary_params_dict={},
diff --git a/python-package/xgboost/spark/data.py b/python-package/xgboost/spark/data.py
index a5b3b1e811a4..11dc02340284 100644
--- a/python-package/xgboost/spark/data.py
+++ b/python-package/xgboost/spark/data.py
@@ -9,6 +9,8 @@
 
 from xgboost import DataIter, DeviceQuantileDMatrix, DMatrix
 
+from .utils import get_logger  # type: ignore
+
 
 def stack_series(series: pd.Series) -> np.ndarray:
     """Stack a series of arrays."""
@@ -246,6 +248,11 @@ def make(values: Dict[str, List[np.ndarray]], kwargs: Dict[str, Any]) -> DMatrix
         else:
             append_fn = append_m
         cache_partitions(iterator, append_fn)
+        if len(train_data) == 0:
+            get_logger("XGBoostPySpark").warning(
+                "Detected an empty partition in the training data. "
+                "Consider to enable repartition_random_shuffle"
+            )
         dtrain = make(train_data, kwargs)
     else:
         cache_partitions(iterator, append_dqm)