From 06a3f59778ed3d0a30f6d9c10f4d6a1e365e83b6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Thu, 25 Apr 2024 05:00:30 +0800
Subject: [PATCH] [pyspark] Sort workers by task ID.

The PySpark interface uses the partition ID as the task ID. Unlike the
order in which workers connect to the tracker, partition IDs are
deterministic across runs.
---
 python-package/xgboost/spark/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python-package/xgboost/spark/utils.py b/python-package/xgboost/spark/utils.py
index 7dbe290ae3e5..1403596c0c2a 100644
--- a/python-package/xgboost/spark/utils.py
+++ b/python-package/xgboost/spark/utils.py
@@ -55,7 +55,7 @@ def _start_tracker(context: BarrierTaskContext, n_workers: int) -> Dict[str, Any
     """Start Rabit tracker with n_workers"""
     env: Dict[str, Any] = {"DMLC_NUM_WORKER": n_workers}
     host = _get_host_ip(context)
-    rabit_context = RabitTracker(host_ip=host, n_workers=n_workers)
+    rabit_context = RabitTracker(host_ip=host, n_workers=n_workers, sortby="task")
     env.update(rabit_context.worker_envs())
     rabit_context.start(n_workers)
     thread = Thread(target=rabit_context.join)
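
Note: the following is a minimal, self-contained sketch of the idea behind
sortby="task" -- it uses hypothetical helper names, not the tracker's actual
internals. It shows why ranking workers by a stable key (the Spark partition
ID, used here as the task ID) yields the same rank assignment on every run,
whereas ranking by connection order does not, since workers may reach the
tracker in any order.

    from typing import List, Tuple

    def rank_by_task(workers: List[Tuple[str, int]]) -> List[str]:
        # Assign ranks by sorting on the task (partition) ID, which Spark
        # keeps stable across runs, instead of by arrival order, which
        # varies with scheduling and network timing.
        return [host for host, _task_id in sorted(workers, key=lambda w: w[1])]

    # Two runs in which workers reach the tracker in different orders still
    # produce the same rank -> host mapping, because the sort key is the
    # deterministic task ID rather than the nondeterministic arrival order.
    run_a = [("node-b", 1), ("node-a", 0), ("node-c", 2)]
    run_b = [("node-c", 2), ("node-a", 0), ("node-b", 1)]
    assert rank_by_task(run_a) == rank_by_task(run_b) == ["node-a", "node-b", "node-c"]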