From 5815df4c46e7cd8fea02c12f315335f8bfcc28b9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 20 Apr 2022 01:19:09 +0800 Subject: [PATCH 01/16] Remove warning in 1.4. (#7815) --- src/learner.cc | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/src/learner.cc b/src/learner.cc index 370b46190b3c..a7bd47e65283 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1200,16 +1200,6 @@ class LearnerImpl : public LearnerIO { os.precision(std::numeric_limits::max_digits10); os << '[' << iter << ']' << std::setiosflags(std::ios::fixed); if (metrics_.size() == 0 && tparam_.disable_default_eval_metric <= 0) { - auto warn_default_eval_metric = [](const std::string& objective, const std::string& before, - const std::string& after, const std::string& version) { - LOG(WARNING) << "Starting in XGBoost " << version << ", the default evaluation metric " - << "used with the objective '" << objective << "' was changed from '" - << before << "' to '" << after << "'. Explicitly set eval_metric if you'd " - << "like to restore the old behavior."; - }; - if (tparam_.objective == "binary:logitraw") { - warn_default_eval_metric(tparam_.objective, "auc", "logloss", "1.4.0"); - } metrics_.emplace_back(Metric::Create(obj_->DefaultEvalMetric(), &generic_parameters_)); metrics_.back()->Configure({cfg_.begin(), cfg_.end()}); } From 52d4eda786cea67dec6dc75844b7fa0b778ff566 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 21 Apr 2022 13:14:02 +0800 Subject: [PATCH 02/16] Deprecate `use_label_encoder` in XGBClassifier. (#7822) * Deprecate `use_label_encoder` in XGBClassifier. * We have removed the encoder, now prepare to remove the indicator. --- demo/guide-python/cat_in_the_dat.py | 1 - demo/guide-python/continuation.py | 14 ++++++-------- demo/guide-python/predict_first_ntree.py | 2 +- doc/tutorials/categorical.rst | 4 +--- python-package/xgboost/sklearn.py | 4 +++- tests/python/test_training_continuation.py | 6 +++--- tests/python/test_with_dask.py | 18 ++++-------------- tests/python/test_with_sklearn.py | 13 +++---------- 8 files changed, 21 insertions(+), 41 deletions(-) diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py index 29f55aba7de1..bd0381d13bb0 100644 --- a/demo/guide-python/cat_in_the_dat.py +++ b/demo/guide-python/cat_in_the_dat.py @@ -63,7 +63,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]: params = { "tree_method": "gpu_hist", - "use_label_encoder": False, "n_estimators": 32, "colsample_bylevel": 0.7, } diff --git a/demo/guide-python/continuation.py b/demo/guide-python/continuation.py index 22fbfc3f78f5..5cddc31086f7 100644 --- a/demo/guide-python/continuation.py +++ b/demo/guide-python/continuation.py @@ -14,13 +14,13 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None: """Basic training continuation.""" # Train 128 iterations in 1 session X, y = load_breast_cancer(return_X_y=True) - clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False) + clf = xgboost.XGBClassifier(n_estimators=128) clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss") print("Total boosted rounds:", clf.get_booster().num_boosted_rounds()) # Train 128 iterations in 2 sessions, with the first one runs for 32 iterations and # the second one runs for 96 iterations - clf = xgboost.XGBClassifier(n_estimators=32, use_label_encoder=False) + clf = xgboost.XGBClassifier(n_estimators=32) clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss") assert clf.get_booster().num_boosted_rounds() == 32 @@ -54,14 +54,14 @@ def 
training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None: n_estimators = 512 X, y = load_breast_cancer(return_X_y=True) - clf = xgboost.XGBClassifier(n_estimators=n_estimators, use_label_encoder=False) + clf = xgboost.XGBClassifier(n_estimators=n_estimators) clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop]) print("Total boosted rounds:", clf.get_booster().num_boosted_rounds()) best = clf.best_iteration # Train 512 iterations in 2 sessions, with the first one runs for 128 iterations and # the second one runs until early stop. - clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False) + clf = xgboost.XGBClassifier(n_estimators=128) # Reinitialize the early stop callback early_stop = xgboost.callback.EarlyStopping( rounds=early_stopping_rounds, save_best=True @@ -79,15 +79,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None: else: path = os.path.join(tmpdir, "model-first-128.json") clf.save_model(path) - loaded = xgboost.XGBClassifier(use_label_encoder=False) + loaded = xgboost.XGBClassifier() loaded.load_model(path) early_stop = xgboost.callback.EarlyStopping( rounds=early_stopping_rounds, save_best=True ) - clf = xgboost.XGBClassifier( - n_estimators=n_estimators - 128, use_label_encoder=False - ) + clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128) clf.fit( X, y, diff --git a/demo/guide-python/predict_first_ntree.py b/demo/guide-python/predict_first_ntree.py index b56de0200843..fb7837728d18 100644 --- a/demo/guide-python/predict_first_ntree.py +++ b/demo/guide-python/predict_first_ntree.py @@ -35,7 +35,7 @@ def native_interface(): def sklearn_interface(): X_train, y_train = load_svmlight_file(train) X_test, y_test = load_svmlight_file(test) - clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False) + clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1) clf.fit(X_train, y_train, eval_set=[(X_test, y_test)]) assert clf.n_classes_ == 2 diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst index 7a185a113116..3f106962d7af 100644 --- a/doc/tutorials/categorical.rst +++ b/doc/tutorials/categorical.rst @@ -36,9 +36,7 @@ parameter ``enable_categorical``: .. code:: python # Supported tree methods are `gpu_hist`, `approx`, and `hist`. - clf = xgb.XGBClassifier( - tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False - ) + clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True) # X is the dataframe we created in previous snippet clf.fit(X, y) # Must use JSON/UBJSON for serialization, otherwise the information is lost. 
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index d27cc6354641..0b4d4f5a914f 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -1304,13 +1304,15 @@ def __init__( self, *, objective: _SklObjective = "binary:logistic", - use_label_encoder: bool = False, + use_label_encoder: Optional[bool] = None, **kwargs: Any ) -> None: # must match the parameters for `get_params` self.use_label_encoder = use_label_encoder if use_label_encoder is True: raise ValueError("Label encoder was removed in 1.6.") + if use_label_encoder is not None: + warnings.warn("`use_label_encoder` is deprecated in 2.0.0.") super().__init__(objective=objective, **kwargs) @_deprecate_positional_args diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py index 44de6bed434d..31a408170766 100644 --- a/tests/python/test_training_continuation.py +++ b/tests/python/test_training_continuation.py @@ -152,16 +152,16 @@ def test_training_continuation_updaters_json(self): def test_changed_parameter(self): from sklearn.datasets import load_breast_cancer X, y = load_breast_cancer(return_X_y=True) - clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False) + clf = xgb.XGBClassifier(n_estimators=2) clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss") assert tm.non_increasing(clf.evals_result()["validation_0"]["logloss"]) with tempfile.TemporaryDirectory() as tmpdir: clf.save_model(os.path.join(tmpdir, "clf.json")) - loaded = xgb.XGBClassifier(use_label_encoder=False) + loaded = xgb.XGBClassifier() loaded.load_model(os.path.join(tmpdir, "clf.json")) - clf = xgb.XGBClassifier(n_estimators=2, use_label_encoder=False) + clf = xgb.XGBClassifier(n_estimators=2) # change metric to error clf.fit(X, y, eval_set=[(X, y)], eval_metric="error") assert tm.non_increasing(clf.evals_result()["validation_0"]["error"]) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 4e80409d4764..c20291d74dec 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -777,9 +777,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> valid_X = dd.from_array(valid_X_, chunksize=n_samples) valid_y = dd.from_array(valid_y_, chunksize=n_samples) - cls = xgb.dask.DaskXGBClassifier( - tree_method=tree_method, n_estimators=2, use_label_encoder=False - ) + cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)]) # multiclass @@ -808,9 +806,7 @@ def run_empty_dmatrix_auc(client: "Client", tree_method: str, n_workers: int) -> valid_X = dd.from_array(valid_X_, chunksize=n_samples) valid_y = dd.from_array(valid_y_, chunksize=n_samples) - cls = xgb.dask.DaskXGBClassifier( - tree_method=tree_method, n_estimators=2, use_label_encoder=False - ) + cls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) cls.fit(X, y, eval_metric=["auc", "aucpr"], eval_set=[(valid_X, valid_y)]) @@ -837,14 +833,10 @@ def run_auc(client: "Client", tree_method: str) -> None: valid_X = dd.from_array(valid_X_, chunksize=10) valid_y = dd.from_array(valid_y_, chunksize=10) - cls = xgb.XGBClassifier( - tree_method=tree_method, n_estimators=2, use_label_encoder=False - ) + cls = xgb.XGBClassifier(tree_method=tree_method, n_estimators=2) cls.fit(X_, y_, eval_metric="auc", eval_set=[(valid_X_, valid_y_)]) - dcls = xgb.dask.DaskXGBClassifier( - tree_method=tree_method, n_estimators=2, 
use_label_encoder=False - ) + dcls = xgb.dask.DaskXGBClassifier(tree_method=tree_method, n_estimators=2) dcls.fit(X, y, eval_metric="auc", eval_set=[(valid_X, valid_y)]) approx = dcls.evals_result()["validation_0"]["auc"] @@ -1693,7 +1685,6 @@ def test_parallel_submits(client: "Client") -> None: verbosity=1, n_estimators=i + 1, eval_metric="merror", - use_label_encoder=False, ) f = client.submit(cls.fit, X, y, pure=False) futures.append(f) @@ -1786,7 +1777,6 @@ def test_parallel_submit_multi_clients() -> None: verbosity=1, n_estimators=i + 1, eval_metric="merror", - use_label_encoder=False, ) f = client.submit(cls.fit, X, y, pure=False) futures.append((client, f)) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index a2e70ae6de2e..cd1297f708d6 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -774,13 +774,12 @@ def save_load_model(model_path): X = digits['data'] kf = KFold(n_splits=2, shuffle=True, random_state=rng) for train_index, test_index in kf.split(X, y): - xgb_model = xgb.XGBClassifier(use_label_encoder=False).fit(X[train_index], y[train_index]) + xgb_model = xgb.XGBClassifier().fit(X[train_index], y[train_index]) xgb_model.save_model(model_path) xgb_model = xgb.XGBClassifier() xgb_model.load_model(model_path) - assert xgb_model.use_label_encoder is False assert isinstance(xgb_model.classes_, np.ndarray) assert isinstance(xgb_model._Booster, xgb.Booster) @@ -972,8 +971,8 @@ def test_deprecate_position_arg(): model.fit(X, y, w) with pytest.warns(FutureWarning): - xgb.XGBClassifier(1, use_label_encoder=False) - model = xgb.XGBClassifier(n_estimators=1, use_label_encoder=False) + xgb.XGBClassifier(1) + model = xgb.XGBClassifier(n_estimators=1) with pytest.warns(FutureWarning): model.fit(X, y, w) @@ -990,9 +989,6 @@ def test_deprecate_position_arg(): with pytest.warns(FutureWarning): model.fit(X, y, w) - with pytest.raises(ValueError): - xgb.XGBRFClassifier(1, use_label_encoder=True) - model = xgb.XGBRFClassifier(n_estimators=1) with pytest.warns(FutureWarning): model.fit(X, y, w) @@ -1334,7 +1330,6 @@ def merror(y_true: np.ndarray, predt: np.ndarray): X, y = load_digits(n_class=10, return_X_y=True) clf = xgb.XGBClassifier( - use_label_encoder=False, tree_method="hist", eval_metric=merror, n_estimators=16, @@ -1344,7 +1339,6 @@ def merror(y_true: np.ndarray, predt: np.ndarray): custom = clf.evals_result() clf = xgb.XGBClassifier( - use_label_encoder=False, tree_method="hist", eval_metric="merror", n_estimators=16, @@ -1360,7 +1354,6 @@ def merror(y_true: np.ndarray, predt: np.ndarray): ) clf = xgb.XGBRFClassifier( - use_label_encoder=False, tree_method="hist", n_estimators=16, objective=tm.softprob_obj(10), eval_metric=merror, From 401d451569cc0a49e7f5b14d7b558eb61a86d2a7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 21 Apr 2022 19:09:54 +0800 Subject: [PATCH 03/16] Clear configuration cache. (#7826) --- src/learner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/learner.cc b/src/learner.cc index a7bd47e65283..7d8419259e1e 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -397,7 +397,7 @@ class LearnerConfiguration : public Learner { this->ValidateParameters(); } - // FIXME(trivialfis): Clear the cache once binary IO is gone. + cfg_.clear(); monitor_.Stop("Configure"); } From c70fa502a50d0178f1e415ab83ef85cc34ca9457 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 21 Apr 2022 20:23:35 +0800 Subject: [PATCH 04/16] Expose `feature_types` to sklearn interface. 
(#7821) --- python-package/xgboost/_typing.py | 5 +-- python-package/xgboost/core.py | 27 ++++++++------- python-package/xgboost/dask.py | 18 +++++++--- python-package/xgboost/data.py | 57 +++++++++++++++++-------------- python-package/xgboost/sklearn.py | 33 ++++++++++++++++-- tests/python/test_with_dask.py | 7 ++++ tests/python/test_with_sklearn.py | 32 +++++++++++++++++ 7 files changed, 131 insertions(+), 48 deletions(-) diff --git a/python-package/xgboost/_typing.py b/python-package/xgboost/_typing.py index d21de6f0ed8c..64ea9a0a2993 100644 --- a/python-package/xgboost/_typing.py +++ b/python-package/xgboost/_typing.py @@ -1,7 +1,7 @@ """Shared typing definition.""" import ctypes import os -from typing import Optional, List, Any, TypeVar, Union +from typing import Optional, Any, TypeVar, Union, Sequence # os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/ # cudf.DataFrame/cupy.array/dlpack @@ -9,7 +9,8 @@ # xgboost accepts some other possible types in practice due to historical reason, which is # lesser tested. For now we encourage users to pass a simple list of string. -FeatureNames = Optional[List[str]] +FeatureNames = Optional[Sequence[str]] +FeatureTypes = Optional[Sequence[str]] ArrayLike = Any PathLike = Union[str, os.PathLike] diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 3321e2f0819f..1c537d365a73 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -31,6 +31,7 @@ CFloatPtr, NumpyOrCupy, FeatureNames, + FeatureTypes, _T, CupyT, ) @@ -553,7 +554,7 @@ def __init__( missing: Optional[float] = None, silent: bool = False, feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_types: FeatureTypes = None, nthread: Optional[int] = None, group: Optional[ArrayLike] = None, qid: Optional[ArrayLike] = None, @@ -594,10 +595,15 @@ def __init__( Whether print messages during construction feature_names : list, optional Set names for features. - feature_types : + feature_types : FeatureTypes Set types for features. When `enable_categorical` is set to `True`, string - "c" represents categorical data type. + "c" represents categorical data type while "q" represents numerical feature + type. For categorical features, the input is assumed to be preprocessed and + encoded by the users. The encoding can be done via + :py:class:`sklearn.preprocessing.OrdinalEncoder` or pandas dataframe + `.cat.codes` method. This is useful when users want to specify categorical + features without having to construct a dataframe as input. nthread : integer, optional Number of threads to use for loading data when parallelization is @@ -1062,12 +1068,7 @@ def feature_names(self, feature_names: FeatureNames) -> None: @property def feature_types(self) -> Optional[List[str]]: - """Get feature types (column types). - - Returns - ------- - feature_types : list or None - """ + """Get feature types. See :py:class:`DMatrix` for details.""" length = c_bst_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() _check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle, @@ -1083,8 +1084,8 @@ def feature_types(self) -> Optional[List[str]]: def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None: """Set feature types (column types). - This is for displaying the results and categorical data support. See doc string - of :py:obj:`xgboost.DMatrix` for details. + This is for displaying the results and categorical data support. See + :py:class:`DMatrix` for details. 
Parameters ---------- @@ -1647,7 +1648,7 @@ def _get_feature_info(self, field: str) -> Optional[List[str]]: feature_info = from_cstr_to_pystr(sarr, length) return feature_info if feature_info else None - def _set_feature_info(self, features: Optional[List[str]], field: str) -> None: + def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None: if features is not None: assert isinstance(features, list) feature_info_bytes = [bytes(f, encoding="utf-8") for f in features] @@ -1667,7 +1668,7 @@ def _set_feature_info(self, features: Optional[List[str]], field: str) -> None: @property def feature_types(self) -> Optional[List[str]]: """Feature types for this booster. Can be directly set by input data or by - assignment. + assignment. See :py:class:`DMatrix` for details. """ return self._get_feature_info("feature_type") diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index a09eeefa0840..942893f0a32d 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -54,10 +54,11 @@ from .compat import PANDAS_INSTALLED, DataFrame, Series, pandas_concat from .compat import lazy_isinstance +from ._typing import FeatureNames, FeatureTypes + from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter from .core import Objective, Metric from .core import _deprecate_positional_args, _has_categorical -from .data import FeatureNames from .training import train as worker_train from .tracker import RabitTracker, get_host_ip from .sklearn import XGBModel, XGBClassifier, XGBRegressorBase, XGBClassifierBase @@ -327,7 +328,7 @@ def __init__( missing: float = None, silent: bool = False, # pylint: disable=unused-argument feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, + feature_types: FeatureTypes = None, group: Optional[_DaskCollection] = None, qid: Optional[_DaskCollection] = None, label_lower_bound: Optional[_DaskCollection] = None, @@ -1601,7 +1602,11 @@ async def _predict_async( predts = predts.to_dask_array() else: test_dmatrix = await DaskDMatrix( - self.client, data=data, base_margin=base_margin, missing=self.missing + self.client, + data=data, + base_margin=base_margin, + missing=self.missing, + feature_types=self.feature_types ) predts = await predict( self.client, @@ -1640,7 +1645,9 @@ async def _apply_async( iteration_range: Optional[Tuple[int, int]] = None, ) -> Any: iteration_range = self._get_iteration_range(iteration_range) - test_dmatrix = await DaskDMatrix(self.client, data=X, missing=self.missing) + test_dmatrix = await DaskDMatrix( + self.client, data=X, missing=self.missing, feature_types=self.feature_types, + ) predts = await predict( self.client, model=self.get_booster(), @@ -1755,6 +1762,7 @@ async def _fit_async( eval_qid=None, missing=self.missing, enable_categorical=self.enable_categorical, + feature_types=self.feature_types, ) if callable(self.objective): @@ -1849,6 +1857,7 @@ async def _fit_async( eval_qid=None, missing=self.missing, enable_categorical=self.enable_categorical, + feature_types=self.feature_types, ) # pylint: disable=attribute-defined-outside-init @@ -2054,6 +2063,7 @@ async def _fit_async( eval_qid=eval_qid, missing=self.missing, enable_categorical=self.enable_categorical, + feature_types=self.feature_types, ) if eval_metric is not None: if callable(eval_metric): diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 119b354fc6dd..00d47599fe73 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ 
-13,6 +13,7 @@ from .core import c_array, _LIB, _check_call, c_str from .core import _cuda_array_interface from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames +from ._typing import FeatureTypes from .compat import lazy_isinstance, DataFrame c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name @@ -70,7 +71,7 @@ def _from_scipy_csr( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): """Initialize data from a CSR matrix.""" if len(data.indices) != len(data.data): @@ -109,7 +110,7 @@ def _from_scipy_csc( data, missing, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): if len(data.indices) != len(data.data): raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}") @@ -165,7 +166,7 @@ def _from_numpy_array( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): """Initialize data from a 2-D numpy matrix. @@ -228,6 +229,12 @@ def _is_modin_df(data): } +_ENABLE_CAT_ERR = ( + "When categorical type is supplied, DMatrix parameter `enable_categorical` must " + "be set to `True`." +) + + def _invalid_dataframe_dtype(data: Any) -> None: # pandas series has `dtypes` but it's just a single object # cudf series doesn't have `dtypes`. @@ -241,9 +248,8 @@ def _invalid_dataframe_dtype(data: Any) -> None: else: err = "" - msg = """DataFrame.dtypes for data must be int, float, bool or category. When -categorical type is supplied, DMatrix parameter `enable_categorical` must -be set to `True`.""" + err + type_err = "DataFrame.dtypes for data must be int, float, bool or category." + msg = f"""{type_err} {_ENABLE_CAT_ERR} {err}""" raise ValueError(msg) @@ -340,8 +346,8 @@ def _from_pandas_df( missing: float, nthread: int, feature_names: FeatureNames, - feature_types: Optional[List[str]], -) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]: + feature_types: FeatureTypes, +) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: data, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) @@ -382,7 +388,7 @@ def _from_pandas_series( nthread: int, enable_categorical: bool, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): from pandas.api.types import is_categorical_dtype @@ -413,7 +419,7 @@ def _is_dt_df(data): def _transform_dt_df( data, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, meta=None, meta_type=None, ): @@ -454,9 +460,9 @@ def _from_dt_df( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]: +) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: if enable_categorical: raise ValueError("categorical data in datatable is not supported yet.") data, feature_names, feature_types = _transform_dt_df( @@ -542,10 +548,10 @@ def _from_arrow( data, missing: float, nthread: int, - feature_names: Optional[List[str]], - feature_types: Optional[List[str]], + feature_names: FeatureNames, + feature_types: FeatureTypes, enable_categorical: bool, -) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]: +) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]: import pyarrow as pa if not all( @@ -621,7 +627,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes: 
def _transform_cudf_df( data, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, enable_categorical: bool, ): try: @@ -687,7 +693,7 @@ def _from_cudf_df( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, enable_categorical: bool, ) -> Tuple[ctypes.c_void_p, Any, Any]: data, cat_codes, feature_names, feature_types = _transform_cudf_df( @@ -735,7 +741,7 @@ def _from_cupy_array( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): """Initialize DMatrix from cupy ndarray.""" data = _transform_cupy_array(data) @@ -782,7 +788,7 @@ def _from_dlpack( missing, nthread, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): data = _transform_dlpack(data) return _from_cupy_array(data, missing, nthread, feature_names, @@ -797,7 +803,7 @@ def _from_uri( data, missing, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): _warn_unused_missing(data, missing) handle = ctypes.c_void_p() @@ -817,7 +823,7 @@ def _from_list( missing, n_threads, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): array = np.array(data) _check_data_shape(data) @@ -833,7 +839,7 @@ def _from_tuple( missing, n_threads, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, ): return _from_list(data, missing, n_threads, feature_names, feature_types) @@ -869,7 +875,7 @@ def dispatch_data_backend( missing, threads, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, enable_categorical: bool = False, ): '''Dispatch data for DMatrix.''' @@ -884,8 +890,7 @@ def dispatch_data_backend( data.tocsr(), missing, threads, feature_names, feature_types ) if _is_numpy_array(data): - return _from_numpy_array(data, missing, threads, feature_names, - feature_types) + return _from_numpy_array(data, missing, threads, feature_names, feature_types) if _is_uri(data): return _from_uri(data, missing, feature_names, feature_types) if _is_list(data): @@ -1101,7 +1106,7 @@ def reset(self) -> None: def _proxy_transform( data, feature_names: FeatureNames, - feature_types: Optional[List[str]], + feature_types: FeatureTypes, enable_categorical: bool, ): if _is_cudf_df(data) or _is_cudf_ser(data): diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 0b4d4f5a914f..ae883e30ee17 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -14,7 +14,7 @@ from .training import train from .callback import TrainingCallback from .data import _is_cudf_df, _is_cudf_ser, _is_cupy_array -from ._typing import ArrayLike +from ._typing import ArrayLike, FeatureTypes # Do not use class names on scikit-learn directly. Re-define the classes on # .compat to guarantee the behavior without scikit-learn @@ -211,6 +211,13 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]: should be used to specify categorical data type. Also, JSON/UBJSON serialization format is required. + feature_types : FeatureTypes + + .. versionadded:: 2.0.0 + + Used for specifying feature types without constructing a dataframe. See + :py:class:`DMatrix` for details. + max_cat_to_onehot : Optional[int] .. 
versionadded:: 1.6.0 @@ -394,6 +401,7 @@ def _wrap_evaluation_matrices( eval_qid: Optional[Sequence[Any]], create_dmatrix: Callable, enable_categorical: bool, + feature_types: FeatureTypes, ) -> Tuple[Any, List[Tuple[Any, str]]]: """Convert array_like evaluation matrices into DMatrix. Perform validation on the way. @@ -408,6 +416,7 @@ def _wrap_evaluation_matrices( feature_weights=feature_weights, missing=missing, enable_categorical=enable_categorical, + feature_types=feature_types, ) n_validation = 0 if eval_set is None else len(eval_set) @@ -455,6 +464,7 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence: base_margin=base_margin_eval_set[i], missing=missing, enable_categorical=enable_categorical, + feature_types=feature_types, ) evals.append(m) nevals = len(evals) @@ -518,6 +528,7 @@ def __init__( validate_parameters: Optional[bool] = None, predictor: Optional[str] = None, enable_categorical: bool = False, + feature_types: FeatureTypes = None, max_cat_to_onehot: Optional[int] = None, eval_metric: Optional[Union[str, List[str], Callable]] = None, early_stopping_rounds: Optional[int] = None, @@ -562,6 +573,7 @@ def __init__( self.validate_parameters = validate_parameters self.predictor = predictor self.enable_categorical = enable_categorical + self.feature_types = feature_types self.max_cat_to_onehot = max_cat_to_onehot self.eval_metric = eval_metric self.early_stopping_rounds = early_stopping_rounds @@ -684,6 +696,7 @@ def get_xgb_params(self) -> Dict[str, Any]: "enable_categorical", "early_stopping_rounds", "callbacks", + "feature_types", } filtered = {} for k, v in params.items(): @@ -715,6 +728,10 @@ def save_model(self, fname: Union[str, os.PathLike]) -> None: # numpy array is not JSON serializable meta['classes_'] = self.classes_.tolist() continue + if k == "feature_types": + # Use the `feature_types` attribute from booster instead. 
+ meta["feature_types"] = None + continue try: json.dumps({k: v}) meta[k] = v @@ -754,6 +771,9 @@ def load_model(self, fname: Union[str, bytearray, os.PathLike]) -> None: if k == 'classes_': self.classes_ = np.array(v) continue + if k == "feature_types": + self.feature_types = self.get_booster().feature_types + continue if k == "_estimator_type": if self._get_type() != v: raise TypeError( @@ -944,6 +964,7 @@ def fit( eval_qid=None, create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), enable_categorical=self.enable_categorical, + feature_types=self.feature_types ) params = self.get_xgb_params() @@ -1063,9 +1084,11 @@ def predict( pass test = DMatrix( - X, base_margin=base_margin, + X, + base_margin=base_margin, missing=self.missing, nthread=self.n_jobs, + feature_types=self.feature_types, enable_categorical=self.enable_categorical ) return self.get_booster().predict( @@ -1106,7 +1129,9 @@ def apply( self.get_booster(), ntree_limit, iteration_range ) iteration_range = self._get_iteration_range(iteration_range) - test_dmatrix = DMatrix(X, missing=self.missing, nthread=self.n_jobs) + test_dmatrix = DMatrix( + X, missing=self.missing, feature_types=self.feature_types, nthread=self.n_jobs + ) return self.get_booster().predict( test_dmatrix, pred_leaf=True, @@ -1397,6 +1422,7 @@ def fit( eval_qid=None, create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), enable_categorical=self.enable_categorical, + feature_types=self.feature_types, ) self._Booster = train( @@ -1828,6 +1854,7 @@ def fit( eval_qid=eval_qid, create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), enable_categorical=self.enable_categorical, + feature_types=self.feature_types, ) evals_result: TrainingCallback.EvalsLog = {} diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index c20291d74dec..21e7983cf59f 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -306,6 +306,13 @@ def test_categorical(client: "Client") -> None: run_categorical(client, "approx", X, X_onehot, y) run_categorical(client, "hist", X, X_onehot, y) + ft = ["c"] * X.shape[1] + reg = xgb.dask.DaskXGBRegressor( + tree_method="hist", feature_types=ft, enable_categorical=True + ) + reg.fit(X, y) + assert reg.get_booster().feature_types == ft + def test_dask_predict_shape_infer(client: "Client") -> None: X, y = make_classification(n_samples=1000, n_informative=5, n_classes=3) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index cd1297f708d6..7edc392f061d 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1273,6 +1273,38 @@ def test_estimator_reg(estimator, check): check(estimator) +def test_categorical(): + X, y = tm.make_categorical(n_samples=32, n_features=2, n_categories=3, onehot=False) + ft = ["c"] * X.shape[1] + reg = xgb.XGBRegressor( + tree_method="hist", + feature_types=ft, + max_cat_to_onehot=1, + enable_categorical=True, + ) + reg.fit(X.values, y, eval_set=[(X.values, y)]) + from_cat = reg.evals_result()["validation_0"]["rmse"] + predt_cat = reg.predict(X.values) + assert reg.get_booster().feature_types == ft + with tempfile.TemporaryDirectory() as tmpdir: + path = os.path.join(tmpdir, "model.json") + reg.save_model(path) + reg = xgb.XGBRegressor() + reg.load_model(path) + assert reg.feature_types == ft + + onehot, y = tm.make_categorical( + n_samples=32, n_features=2, n_categories=3, onehot=True + ) + reg = xgb.XGBRegressor(tree_method="hist") + reg.fit(onehot, y, 
eval_set=[(onehot, y)]) + from_enc = reg.evals_result()["validation_0"]["rmse"] + predt_enc = reg.predict(onehot) + + np.testing.assert_allclose(from_cat, from_enc) + np.testing.assert_allclose(predt_cat, predt_enc) + + def test_prediction_config(): reg = xgb.XGBRegressor() assert reg._can_use_inplace_predict() is True From c13a2a31145dea2cf110e95dd5ae2ddda76d3cd9 Mon Sep 17 00:00:00 2001 From: forestkey <284918285@qq.com> Date: Fri, 22 Apr 2022 16:54:30 +0800 Subject: [PATCH 05/16] [doc] "irrevelant" to "irrelevant" (#7832) --- doc/treemethod.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/treemethod.rst b/doc/treemethod.rst index 8feba686c4f5..91f546c36a9c 100644 --- a/doc/treemethod.rst +++ b/doc/treemethod.rst @@ -10,7 +10,7 @@ are also some free standing updaters including ``grow_local_histmaker``, ``refre as the latter is just a pre-configuration of the former. The difference is mostly due to historical reasons that each updater requires some specific configurations and might has missing features. As we are moving forward, the gap between them is becoming more and -more irrevelant. We will collectively document them under tree methods. +more irrelevant. We will collectively document them under tree methods. ************** Exact Solution From f0f76259c920bfbc6c34f0239e74561fea8f4ec3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 22 Apr 2022 19:07:51 +0800 Subject: [PATCH 06/16] Remove `STRING_TYPES`. (#7827) --- python-package/xgboost/callback.py | 3 +-- python-package/xgboost/compat.py | 3 --- python-package/xgboost/core.py | 14 +++++++------- python-package/xgboost/rabit.py | 4 ++-- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/python-package/xgboost/callback.py b/python-package/xgboost/callback.py index a0189c15ad32..32d408f3a29e 100644 --- a/python-package/xgboost/callback.py +++ b/python-package/xgboost/callback.py @@ -16,7 +16,6 @@ from . import rabit from .core import Booster, DMatrix, XGBoostError, _get_booster_layer_trees -from .compat import STRING_TYPES __all__ = [ @@ -82,7 +81,7 @@ def _aggcv(rlist: List[str]) -> List[Tuple[str, float, float]]: results = [] for (_, name), s in sorted(cvmap.items(), key=lambda x: x[0][0]): as_arr = numpy.array(s) - if not isinstance(msg, STRING_TYPES): + if not isinstance(msg, str): msg = msg.decode() mean, std = numpy.mean(as_arr), numpy.std(as_arr) results.extend([(name, mean, std)]) diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py index 256a77adf463..1967ffc8e1e6 100644 --- a/python-package/xgboost/compat.py +++ b/python-package/xgboost/compat.py @@ -10,9 +10,6 @@ assert (sys.version_info[0] == 3), 'Python 2 is no longer supported.' 
-# pylint: disable=invalid-name, redefined-builtin -STRING_TYPES = (str,) - def py_str(x): """convert c string back to python string""" diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 1c537d365a73..5972db02f669 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -17,7 +17,7 @@ import numpy as np import scipy.sparse -from .compat import STRING_TYPES, DataFrame, py_str, PANDAS_INSTALLED +from .compat import DataFrame, py_str, PANDAS_INSTALLED from .libpath import find_lib_path from ._typing import ( CStrPptr, @@ -1387,7 +1387,7 @@ def __init__( _check_call( _LIB.XGBoosterUnserializeFromBuffer(self.handle, ptr, length)) self.__dict__.update(state) - elif isinstance(model_file, (STRING_TYPES, os.PathLike, bytearray)): + elif isinstance(model_file, (str, os.PathLike, bytearray)): self.load_model(model_file) elif model_file is None: pass @@ -1629,7 +1629,7 @@ def set_attr(self, **kwargs: Optional[str]) -> None: """ for key, value in kwargs.items(): if value is not None: - if not isinstance(value, STRING_TYPES): + if not isinstance(value, str): raise ValueError("Set Attr only accepts string values") value = c_str(str(value)) _check_call(_LIB.XGBoosterSetAttr( @@ -1705,7 +1705,7 @@ def set_param( """ if isinstance(params, Mapping): params = params.items() - elif isinstance(params, STRING_TYPES) and value is not None: + elif isinstance(params, str) and value is not None: params = [(params, value)] for key, val in params: if val is not None: @@ -1796,7 +1796,7 @@ def eval_set( for d in evals: if not isinstance(d[0], DMatrix): raise TypeError(f"expected DMatrix, got {type(d[0]).__name__}") - if not isinstance(d[1], STRING_TYPES): + if not isinstance(d[1], str): raise TypeError(f"expected string, got {type(d[1]).__name__}") self._validate_features(d[0]) @@ -2192,7 +2192,7 @@ def save_model(self, fname: Union[str, os.PathLike]) -> None: Output file name """ - if isinstance(fname, (STRING_TYPES, os.PathLike)): # assume file name + if isinstance(fname, (str, os.PathLike)): # assume file name fname = os.fspath(os.path.expanduser(fname)) _check_call(_LIB.XGBoosterSaveModel( self.handle, c_str(fname))) @@ -2301,7 +2301,7 @@ def dump_model(self, fout: Union[str, os.PathLike], fmap: Union[str, os.PathLike dump_format : string, optional Format of model dump file. Can be 'text' or 'json'. """ - if isinstance(fout, (STRING_TYPES, os.PathLike)): + if isinstance(fout, (str, os.PathLike)): fout = os.fspath(os.path.expanduser(fout)) # pylint: disable=consider-using-with fout_obj = open(fout, 'w', encoding="utf-8") diff --git a/python-package/xgboost/rabit.py b/python-package/xgboost/rabit.py index 29723f4d062e..a28448df8a67 100644 --- a/python-package/xgboost/rabit.py +++ b/python-package/xgboost/rabit.py @@ -6,7 +6,7 @@ import numpy as np -from .core import _LIB, c_str, STRING_TYPES, _check_call +from .core import _LIB, c_str, _check_call def _init_rabit() -> None: @@ -73,7 +73,7 @@ def tracker_print(msg: Any) -> None: msg : str The message to be printed to tracker. """ - if not isinstance(msg, STRING_TYPES): + if not isinstance(msg, str): msg = str(msg) is_dist = _LIB.RabitIsDistributed() if is_dist != 0: From c45665a55a50e7134880b17ce100dd5be74887d2 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sat, 23 Apr 2022 00:06:50 +0800 Subject: [PATCH 07/16] [jvm-packages] move the dmatrix building into rabit context (#7823) This fixes the QuantileDeviceDMatrix in distributed environment. 
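The core of the change: each partition now yields a `() => Watches` thunk instead of
an eagerly built `Watches`, so the GPU pipeline can defer DMatrix construction until
after `Rabit.init`, while the CPU pipeline still builds it up front. A minimal sketch
of the resulting control flow (illustrative only; `withDeferredBuild`, `init`, and
`shutdown` are hypothetical stand-ins, not names from this patch):

    // Sketch only: `init`/`shutdown` stand in for Rabit.init/Rabit.shutdown,
    // `build` for the () => Watches thunk produced per partition.
    def withDeferredBuild[T](buildInRabit: Boolean, build: () => T)
                            (init: () => Unit, shutdown: () => Unit)
                            (use: T => Unit): Unit = {
      // CPU pipeline: build the DMatrix before entering the tracker context.
      var built: Option[T] = if (buildInRabit) None else Some(build())
      init()
      try {
        // GPU pipeline: build inside the tracker context, after init.
        if (buildInRabit) built = Some(build())
        built.foreach(use)
      } finally {
        shutdown()
      }
    }

Deferring construction matters for the GPU path presumably because the quantile
sketching performed while building the device DMatrix needs the communication
context to be up in a distributed setting.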
--- .../scala/rapids/spark/GpuPreXGBoost.scala | 28 +++++----- .../xgboost4j/scala/spark/PreXGBoost.scala | 53 ++++++++++--------- .../scala/spark/PreXGBoostProvider.scala | 12 +++-- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 47 +++++++++++----- .../spark/XGBoostRabitRegressionSuite.scala | 2 + 5 files changed, 87 insertions(+), 55 deletions(-) diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala index 0c3521069b37..5176a9cc0106 100644 --- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala @@ -56,18 +56,20 @@ class GpuPreXGBoost extends PreXGBoostProvider { } /** - * Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost + * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost * * @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]] * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ]) - * RDD[Watches] will be used as the training input + * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) + * Boolean if building DMatrix in rabit context + * RDD[() => Watches] will be used as the training input * Option[ RDD[_] ] is the optional cached RDD */ override def buildDatasetToRDD(estimator: Estimator[_], dataset: Dataset[_], - params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = { + params: Map[String, Any]): + XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params) } @@ -116,19 +118,21 @@ object GpuPreXGBoost extends PreXGBoostProvider { } /** - * Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost + * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost * * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ]) - * RDD[Watches] will be used as the training input + * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) + * Boolean if building DMatrix in rabit context + * RDD[() => Watches] will be used as the training input to build DMatrix * Option[ RDD[_] ] is the optional cached RDD */ override def buildDatasetToRDD( estimator: Estimator[_], dataset: Dataset[_], - params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = { + params: Map[String, Any]): + XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) = estimator match { @@ -166,7 +170,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { xgbExecParams: XGBoostExecutionParams => val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers, xgbExecParams.cacheTrainingSet) - (buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None) + (true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None) } /** @@ -448,7 +452,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { private def 
buildRDDWatches( dataMap: Map[String, ColumnDataBatch], xgbExeParams: XGBoostExecutionParams, - noEvalSet: Boolean): RDD[Watches] = { + noEvalSet: Boolean): RDD[() => Watches] = { val sc = dataMap(TRAIN_NAME).rawDF.sparkSession.sparkContext val maxBin = xgbExeParams.toMap.getOrElse("max_bin", 256).asInstanceOf[Int] @@ -459,7 +463,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { GpuUtils.toColumnarRdd(dataMap(TRAIN_NAME).rawDF).mapPartitions({ iter => val iterColBatch = iter.map(table => new GpuColumnBatch(table, null)) - Iterator(buildWatches( + Iterator(() => buildWatches( PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing, colIndicesForTrain, iterColBatch, maxBin)) }) @@ -469,7 +473,7 @@ object GpuPreXGBoost extends PreXGBoostProvider { val nameAndColIndices = dataMap.map(nc => (nc._1, nc._2.colIndices)) coPartitionForGpu(dataMap, sc, xgbExeParams.numWorkers).mapPartitions { nameAndColumnBatchIter => - Iterator(buildWatchesWithEval( + Iterator(() => buildWatchesWithEval( PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing, nameAndColIndices, nameAndColumnBatchIter, maxBin)) } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala index 32fd6938eb17..01eb3d0a4f32 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoost.scala @@ -96,19 +96,21 @@ object PreXGBoost extends PreXGBoostProvider { } /** - * Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost + * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost * * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ]) - * RDD[Watches] will be used as the training input + * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) + * Boolean if building DMatrix in rabit context + * RDD[() => Watches] will be used as the training input * Option[RDD[_]\] is the optional cached RDD */ override def buildDatasetToRDD( estimator: Estimator[_], dataset: Dataset[_], - params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = { + params: Map[String, Any]): XGBoostExecutionParams => + (Boolean, RDD[() => Watches], Option[RDD[_]]) = { if (optionProvider.isDefined && optionProvider.get.providerEnabled(Some(dataset))) { return optionProvider.get.buildDatasetToRDD(estimator, dataset, params) @@ -170,12 +172,12 @@ object PreXGBoost extends PreXGBoostProvider { val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) case Right(trainingData) => val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) } } @@ -311,17 +313,18 @@ object PreXGBoost extends PreXGBoostProvider { /** - * Converting the RDD[XGBLabeledPoint] to the 
function to build RDD[Watches] + * Converting the RDD[XGBLabeledPoint] to the function to build RDD[() => Watches] * * @param trainingSet the input training RDD[XGBLabeledPoint] * @param evalRDDMap the eval set * @param hasGroup if has group - * @return function to build (RDD[Watches], the cached RDD) + * @return function to build (RDD[() => Watches], the cached RDD) */ private[spark] def buildRDDLabeledPointToRDDWatches( trainingSet: RDD[XGBLabeledPoint], evalRDDMap: Map[String, RDD[XGBLabeledPoint]] = Map(), - hasGroup: Boolean = false): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = { + hasGroup: Boolean = false): + XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = { xgbExecParams: XGBoostExecutionParams => composeInputData(trainingSet, hasGroup, xgbExecParams.numWorkers) match { @@ -329,12 +332,12 @@ object PreXGBoost extends PreXGBoostProvider { val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (false, trainForRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) case Right(trainingData) => val cachedRDD = if (xgbExecParams.cacheTrainingSet) { Some(trainingData.persist(StorageLevel.MEMORY_AND_DISK)) } else None - (trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) + (false, trainForNonRanking(trainingData, xgbExecParams, evalRDDMap), cachedRDD) } } @@ -374,34 +377,34 @@ object PreXGBoost extends PreXGBoostProvider { } /** - * Build RDD[Watches] for Ranking + * Build RDD[() => Watches] for Ranking * @param trainingData the training data RDD * @param xgbExecutionParams xgboost execution params * @param evalSetsMap the eval RDD - * @return RDD[Watches] + * @return RDD[() => Watches] */ private def trainForRanking( trainingData: RDD[Array[XGBLabeledPoint]], xgbExecutionParam: XGBoostExecutionParams, - evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[Watches] = { + evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[() => Watches] = { if (evalSetsMap.isEmpty) { trainingData.mapPartitions(labeledPointGroups => { - val watches = Watches.buildWatchesWithGroup(xgbExecutionParam, + val buildWatches = () => Watches.buildWatchesWithGroup(xgbExecutionParam, DataUtils.processMissingValuesWithGroup(labeledPointGroups, xgbExecutionParam.missing, xgbExecutionParam.allowNonZeroForMissing), getCacheDirName(xgbExecutionParam.useExternalMemory)) - Iterator.single(watches) + Iterator.single(buildWatches) }).cache() } else { coPartitionGroupSets(trainingData, evalSetsMap, xgbExecutionParam.numWorkers).mapPartitions( labeledPointGroupSets => { - val watches = Watches.buildWatchesWithGroup( + val buildWatches = () => Watches.buildWatchesWithGroup( labeledPointGroupSets.map { case (name, iter) => (name, DataUtils.processMissingValuesWithGroup(iter, xgbExecutionParam.missing, xgbExecutionParam.allowNonZeroForMissing)) }, getCacheDirName(xgbExecutionParam.useExternalMemory)) - Iterator.single(watches) + Iterator.single(buildWatches) }).cache() } } @@ -462,35 +465,35 @@ object PreXGBoost extends PreXGBoostProvider { } /** - * Build RDD[Watches] for Non-Ranking + * Build RDD[() => Watches] for Non-Ranking * @param trainingData the training data RDD * @param xgbExecutionParams xgboost execution params * @param evalSetsMap the eval RDD - * @return RDD[Watches] + * @return RDD[() => Watches] */ private def trainForNonRanking( trainingData: RDD[XGBLabeledPoint], xgbExecutionParams: XGBoostExecutionParams, 
- evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[Watches] = { + evalSetsMap: Map[String, RDD[XGBLabeledPoint]]): RDD[() => Watches] = { if (evalSetsMap.isEmpty) { trainingData.mapPartitions { labeledPoints => { - val watches = Watches.buildWatches(xgbExecutionParams, + val buildWatches = () => Watches.buildWatches(xgbExecutionParams, DataUtils.processMissingValues(labeledPoints, xgbExecutionParams.missing, xgbExecutionParams.allowNonZeroForMissing), getCacheDirName(xgbExecutionParams.useExternalMemory)) - Iterator.single(watches) + Iterator.single(buildWatches) }}.cache() } else { coPartitionNoGroupSets(trainingData, evalSetsMap, xgbExecutionParams.numWorkers). mapPartitions { nameAndLabeledPointSets => - val watches = Watches.buildWatches( + val buildWatches = () => Watches.buildWatches( nameAndLabeledPointSets.map { case (name, iter) => (name, DataUtils.processMissingValues(iter, xgbExecutionParams.missing, xgbExecutionParams.allowNonZeroForMissing)) }, getCacheDirName(xgbExecutionParams.useExternalMemory)) - Iterator.single(watches) + Iterator.single(buildWatches) }.cache() } } diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala index 7d0c1dde2e3d..d133aea288dd 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/PreXGBoostProvider.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2021 by Contributors + Copyright (c) 2021-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -45,19 +45,21 @@ private[scala] trait PreXGBoostProvider { def transformSchema(xgboostEstimator: XGBoostEstimatorCommon, schema: StructType): StructType /** - * Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost + * Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost * * @param estimator supports XGBoostClassifier and XGBoostRegressor * @param dataset the training data * @param params all user defined and defaulted params - * @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ]) - * RDD[Watches] will be used as the training input + * @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ]) + * Boolean if building DMatrix in rabit context + * RDD[() => Watches] will be used as the training input to build DMatrix * Option[ RDD[_] ] is the optional cached RDD */ def buildDatasetToRDD( estimator: Estimator[_], dataset: Dataset[_], - params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) + params: Map[String, Any]): + XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) /** * Transform Dataset diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index c16e45858415..df19858749cd 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -283,13 +283,8 @@ object XGBoost extends Serializable { } } - private def buildDistributedBooster( - watches: Watches, - xgbExecutionParam: XGBoostExecutionParams, - rabitEnv: java.util.Map[String, String], - obj: ObjectiveTrait, - eval: EvalTrait, - prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = { + private def buildWatchesAndCheck(buildWatchesFun: () => Watches): Watches = { + val watches = buildWatchesFun() // to workaround the empty partitions in training dataset, // this might not be the best efficient implementation, see // (https://github.com/dmlc/xgboost/issues/1277) @@ -298,14 +293,39 @@ object XGBoost extends Serializable { s"detected an empty partition in the training data, partition ID:" + s" ${TaskContext.getPartitionId()}") } + watches + } + + private def buildDistributedBooster( + buildDMatrixInRabit: Boolean, + buildWatches: () => Watches, + xgbExecutionParam: XGBoostExecutionParams, + rabitEnv: java.util.Map[String, String], + obj: ObjectiveTrait, + eval: EvalTrait, + prevBooster: Booster): Iterator[(Booster, Map[String, Array[Float]])] = { + + var watches: Watches = null + if (!buildDMatrixInRabit) { + // for CPU pipeline, we need to build DMatrix out of rabit context + watches = buildWatchesAndCheck(buildWatches) + } + val taskId = TaskContext.getPartitionId().toString val attempt = TaskContext.get().attemptNumber.toString rabitEnv.put("DMLC_TASK_ID", taskId) rabitEnv.put("DMLC_NUM_ATTEMPT", attempt) val numRounds = xgbExecutionParam.numRounds val makeCheckpoint = xgbExecutionParam.checkpointParam.isDefined && taskId.toInt == 0 + try { Rabit.init(rabitEnv) + + if (buildDMatrixInRabit) { + // for GPU pipeline, we need to move dmatrix building into rabit context + watches = buildWatchesAndCheck(buildWatches) + } + val numEarlyStoppingRounds = xgbExecutionParam.earlyStoppingParams.numEarlyStoppingRounds val metrics = Array.tabulate(watches.size)(_ => Array.ofDim[Float](numRounds)) val externalCheckpointParams = 
xgbExecutionParam.checkpointParam @@ -338,7 +358,7 @@ object XGBoost extends Serializable { throw xgbException } finally { Rabit.shutdown() - watches.delete() + if (watches != null) watches.delete() } } @@ -364,7 +384,7 @@ object XGBoost extends Serializable { @throws(classOf[XGBoostError]) private[spark] def trainDistributed( sc: SparkContext, - buildTrainingData: XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]), + buildTrainingData: XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]), params: Map[String, Any]): (Booster, Map[String, Array[Float]]) = { @@ -383,7 +403,7 @@ object XGBoost extends Serializable { }.orNull // Get the training data RDD and the cachedRDD - val (trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams) + val (buildDMatrixInRabit, trainingRDD, optionalCachedRDD) = buildTrainingData(xgbExecParams) try { // Train for every ${savingRound} rounds and save the partially completed booster @@ -398,15 +418,16 @@ object XGBoost extends Serializable { val rabitEnv = tracker.getWorkerEnvs val boostersAndMetrics = trainingRDD.mapPartitions { iter => { - var optionWatches: Option[Watches] = None + var optionWatches: Option[() => Watches] = None // take the first Watches to train if (iter.hasNext) { optionWatches = Some(iter.next()) } - optionWatches.map { watches => buildDistributedBooster(watches, xgbExecParams, rabitEnv, - xgbExecParams.obj, xgbExecParams.eval, prevBooster)} + optionWatches.map { buildWatches => buildDistributedBooster(buildDMatrixInRabit, + buildWatches, xgbExecParams, rabitEnv, xgbExecParams.obj, + xgbExecParams.eval, prevBooster)} .getOrElse(throw new RuntimeException("No Watches to train")) }}.cache() diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala index 2e51f15b0161..7e2cbb6d537f 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala @@ -119,6 +119,8 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest { } test("test SparkContext should not be killed ") { + cancel("For some reason, sparkContext can't cancel the job locally in the CI env," + + "which will be resolved when introducing barrier mode") val training = buildDataFrame(Classification.train) // mock rank 0 failure during 8th allreduce synchronization Rabit.mockList = Array("0,8,0,0").toList.asJava From 332380479bd7ea718c04124fe35693773eca1112 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 23 Apr 2022 02:07:01 +0800 Subject: [PATCH 08/16] Avoid warning in np primitive type tests. 
(#7833) --- tests/python/test_basic_models.py | 2 +- tests/python/test_predict.py | 10 +++++++--- tests/python/test_with_pandas.py | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 2cfa26402de8..510aec506a1a 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -569,7 +569,7 @@ def test_feature_info(self): y = rng.randn(rows) feature_names = ["test_feature_" + str(i) for i in range(cols)] X_pd = pd.DataFrame(X, columns=feature_names) - X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int) + X_pd.iloc[:, 3] = X_pd.iloc[:, 3].astype(np.int32) Xy = xgb.DMatrix(X_pd, y) assert Xy.feature_types[3] == "int" diff --git a/tests/python/test_predict.py b/tests/python/test_predict.py index b34d508cd845..f4ea944e8bd4 100644 --- a/tests/python/test_predict.py +++ b/tests/python/test_predict.py @@ -245,19 +245,23 @@ def test_dtypes(self): predt_orig = self.booster.inplace_predict(orig) # all primitive types in numpy for dtype in [ - np.signedinteger, + np.int32, + np.int64, np.byte, np.short, np.intc, np.int_, np.longlong, - np.unsignedinteger, + np.uint32, + np.uint64, np.ubyte, np.ushort, np.uintc, np.uint, np.ulonglong, - np.floating, + np.float16, + np.float32, + np.float64, np.half, np.single, np.double, diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index c55d698bbe11..1401dd69948b 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -328,5 +328,5 @@ def test_bool(dtype) -> bytes: return to_bytes(Xy) b0 = test_bool(pd.BooleanDtype()) - b1 = test_bool(np.bool) + b1 = test_bool(bool) assert b0 != b1 # None is converted to False with np.bool From 6ece549a905759617b6160d465b356aa058daed6 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Sun, 24 Apr 2022 14:25:22 +0800 Subject: [PATCH 09/16] [doc] update the jvm tutorial to 1.6.1 [skip ci] (#7834) --- doc/jvm/xgboost4j_spark_gpu_tutorial.rst | 4 ++-- doc/jvm/xgboost4j_spark_tutorial.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst index 5af257da0439..f3b97d9c319f 100644 --- a/doc/jvm/xgboost4j_spark_gpu_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_gpu_tutorial.rst @@ -1,5 +1,5 @@ ############################################# -XGBoost4J-Spark-GPU Tutorial (version 1.6.0+) +XGBoost4J-Spark-GPU Tutorial (version 1.6.1+) ############################################# **XGBoost4J-Spark-GPU** is an open source library aiming to accelerate distributed XGBoost training on Apache Spark cluster from @@ -220,7 +220,7 @@ application jar is iris-1.0.0.jar cudf_version=22.02.0 rapids_version=22.02.0 - xgboost_version=1.6.0 + xgboost_version=1.6.1 main_class=Iris app_jar=iris-1.0.0.jar diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index bc0ae92764da..ce689cb95358 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -129,7 +129,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri .. note:: - There is no need to assemble feature columns from version 1.6.0+. Instead, users can specify an array of + There is no need to assemble feature columns from version 1.6.1+. Instead, users can specify an array of feature column names by ``setFeaturesCol(value: Array[String])`` and XGBoost4j-Spark will do it.
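  For instance, a minimal sketch of this usage (the column names ``f1``/``f2``/``f3``, the ``label`` column, and the parameter values below are illustrative assumptions, not something this patch prescribes):

  .. code-block:: scala

    // Hypothetical training DataFrame with numeric columns "f1", "f2", "f3" and a "label" column.
    val xgbClassifier = new XGBoostClassifier(Map(
        "objective" -> "binary:logistic",
        "num_round" -> 10,
        "num_workers" -> 2))
      .setFeaturesCol(Array("f1", "f2", "f3"))  // no VectorAssembler step needed from 1.6.1+
      .setLabelCol("label")
    val model = xgbClassifier.fit(trainingDF)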
Dealing with missing values From dc2e6996568cd824bf9afb3361c40df1696358c4 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 25 Apr 2022 17:09:52 +0800 Subject: [PATCH 10/16] [Breaking][jvm-packages] Use barrier execution mode (#7836) With the introduction of the barrier execution mode, we don't need to kill the SparkContext when some XGBoost tasks fail. Instead, Spark will handle the errors for us. So in this PR, the `killSparkContextOnWorkerFailure` parameter is deleted. --- .../dmlc/xgboost4j/scala/spark/XGBoost.scala | 78 ++------ .../spark/params/LearningTaskParams.scala | 8 +- .../spark/SparkParallelismTracker.scala | 175 ------------------ .../src/test/resources/log4j.properties | 2 +- .../ExternalCheckpointManagerSuite.scala | 4 +- .../spark/FeatureSizeValidatingSuite.scala | 6 +- .../spark/MissingValueHandlingSuite.scala | 10 +- .../scala/spark/ParameterSuite.scala | 32 +--- .../dmlc/xgboost4j/scala/spark/PerTest.scala | 30 +-- .../scala/spark/PersistenceSuite.scala | 2 +- .../scala/spark/XGBoostConfigureSuite.scala | 4 +- .../scala/spark/XGBoostGeneralSuite.scala | 16 +- .../spark/XGBoostRabitRegressionSuite.scala | 57 +----- .../scala/spark/XGBoostRegressorSuite.scala | 1 - .../spark/SparkParallelismTrackerSuite.scala | 151 --------------- 15 files changed, 60 insertions(+), 516 deletions(-) delete mode 100644 jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala delete mode 100644 jvm-packages/xgboost4j-spark/src/test/scala/org/apache/spark/SparkParallelismTrackerSuite.scala diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala index df19858749cd..e6ccb6349b57 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala @@ -21,6 +21,7 @@ import java.io.File import scala.collection.mutable import scala.util.Random import scala.collection.JavaConverters._ + import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, RabitTracker => PyRabitTracker} import ml.dmlc.xgboost4j.scala.rabit.RabitTracker import ml.dmlc.xgboost4j.scala.spark.params.LearningTaskParams @@ -30,8 +31,9 @@ import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} import org.apache.commons.io.FileUtils import org.apache.commons.logging.LogFactory import org.apache.hadoop.fs.FileSystem + import org.apache.spark.rdd.RDD -import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext} +import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.sql.SparkSession /** @@ -79,8 +81,7 @@ private[scala] case class XGBoostExecutionParams( earlyStoppingParams: XGBoostExecutionEarlyStoppingParams, cacheTrainingSet: Boolean, treeMethod: Option[String], - isLocal: Boolean, - killSparkContextOnWorkerFailure: Boolean) { + isLocal: Boolean) { private var rawParamMap: Map[String, Any] = _ @@ -224,9 +225,6 @@ private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s val cacheTrainingSet = overridedParams.getOrElse("cache_training_set", false) .asInstanceOf[Boolean] - val killSparkContext = overridedParams.getOrElse("kill_spark_context_on_worker_failure", true) - .asInstanceOf[Boolean] - val xgbExecParam = XGBoostExecutionParams(nWorkers, round, useExternalMemory, obj, eval, missing, allowNonZeroForMissing, trackerConf, timeoutRequestWorkers, @@ -235,8 +233,7 @@
private[this] class XGBoostExecutionParamsFactory(rawParams: Map[String, Any], s xgbExecEarlyStoppingParams, cacheTrainingSet, treeMethod, - isLocal, - killSparkContext) + isLocal) xgbExecParam.setRawParamMap(overridedParams) xgbExecParam } @@ -351,7 +348,11 @@ object XGBoost extends Serializable { watches.toMap, metrics, obj, eval, earlyStoppingRound = numEarlyStoppingRounds, prevBooster) } - Iterator(booster -> watches.toMap.keys.zip(metrics).toMap) + if (TaskContext.get().partitionId() == 0) { + Iterator(booster -> watches.toMap.keys.zip(metrics).toMap) + } else { + Iterator.empty + } } catch { case xgbException: XGBoostError => logger.error(s"XGBooster worker $taskId has failed $attempt times due to ", xgbException) @@ -409,15 +410,10 @@ object XGBoost extends Serializable { // Train for every ${savingRound} rounds and save the partially completed booster val tracker = startTracker(xgbExecParams.numWorkers, xgbExecParams.trackerConf) val (booster, metrics) = try { - val parallelismTracker = new SparkParallelismTracker(sc, - xgbExecParams.timeoutRequestWorkers, - xgbExecParams.numWorkers, - xgbExecParams.killSparkContextOnWorkerFailure) - tracker.getWorkerEnvs().putAll(xgbRabitParams) val rabitEnv = tracker.getWorkerEnvs - val boostersAndMetrics = trainingRDD.mapPartitions { iter => { + val boostersAndMetrics = trainingRDD.barrier().mapPartitions { iter => { var optionWatches: Option[() => Watches] = None // take the first Watches to train @@ -430,24 +426,14 @@ object XGBoost extends Serializable { xgbExecParams.eval, prevBooster)} .getOrElse(throw new RuntimeException("No Watches to train")) - }}.cache() - - val sparkJobThread = new Thread() { - override def run() { - // force the job - boostersAndMetrics.foreachPartition(() => _) - } - } - sparkJobThread.setUncaughtExceptionHandler(tracker) - - val trackerReturnVal = parallelismTracker.execute { - sparkJobThread.start() - tracker.waitFor(0L) - } + }} + val (booster, metrics) = boostersAndMetrics.collect()(0) + val trackerReturnVal = tracker.waitFor(0L) logger.info(s"Rabit returns with exit code $trackerReturnVal") - val (booster, metrics) = postTrackerReturnProcessing(trackerReturnVal, - boostersAndMetrics, sparkJobThread) + if (trackerReturnVal != 0) { + throw new XGBoostError("XGBoostModel training failed.") + } (booster, metrics) } finally { tracker.stop() @@ -467,42 +453,12 @@ object XGBoost extends Serializable { case t: Throwable => // if the job was aborted due to an exception logger.error("the job was aborted due to ", t) - if (xgbExecParams.killSparkContextOnWorkerFailure) { - sc.stop() - } throw t } finally { optionalCachedRDD.foreach(_.unpersist()) } } - private def postTrackerReturnProcessing( - trackerReturnVal: Int, - distributedBoostersAndMetrics: RDD[(Booster, Map[String, Array[Float]])], - sparkJobThread: Thread): (Booster, Map[String, Array[Float]]) = { - if (trackerReturnVal == 0) { - // Copies of the final booster and the corresponding metrics - // reside in each partition of the `distributedBoostersAndMetrics`. - // Any of them can be used to create the model. 
- // it's safe to block here forever, as the tracker has returned successfully, and the Spark - // job should have finished, there is no reason for the thread cannot return - sparkJobThread.join() - val (booster, metrics) = distributedBoostersAndMetrics.first() - distributedBoostersAndMetrics.unpersist(false) - (booster, metrics) - } else { - try { - if (sparkJobThread.isAlive) { - sparkJobThread.interrupt() - } - } catch { - case _: InterruptedException => - logger.info("spark job thread is interrupted") - } - throw new XGBoostError("XGBoostModel training failed") - } - } - } class Watches private[scala] ( diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala index 988535547441..852864d9cb1c 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/LearningTaskParams.scala @@ -105,14 +105,8 @@ private[spark] trait LearningTaskParams extends Params { final def getMaximizeEvaluationMetrics: Boolean = $(maximizeEvaluationMetrics) - /** - * whether killing SparkContext when training task fails - */ - final val killSparkContextOnWorkerFailure = new BooleanParam(this, - "killSparkContextOnWorkerFailure", "whether killing SparkContext when training task fails") - setDefault(objective -> "reg:squarederror", baseScore -> 0.5, trainTestRatio -> 1.0, - numEarlyStoppingRounds -> 0, cacheTrainingSet -> false, killSparkContextOnWorkerFailure -> true) + numEarlyStoppingRounds -> 0, cacheTrainingSet -> false) } private[spark] object LearningTaskParams { diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala b/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala deleted file mode 100644 index 99c1cccf2761..000000000000 --- a/jvm-packages/xgboost4j-spark/src/main/scala/org/apache/spark/SparkParallelismTracker.scala +++ /dev/null @@ -1,175 +0,0 @@ -/* - Copyright (c) 2014 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package org.apache.spark - -import org.apache.commons.logging.LogFactory -import org.apache.spark.scheduler._ - -import scala.collection.mutable.{HashMap, HashSet} - -/** - * A tracker that ensures enough number of executor cores are alive. - * Throws an exception when the number of alive cores is less than nWorkers. - * - * @param sc The SparkContext object - * @param timeout The maximum time to wait for enough number of workers. 
- * @param numWorkers nWorkers used in an XGBoost Job - * @param killSparkContextOnWorkerFailure kill SparkContext or not when task fails - */ -class SparkParallelismTracker( - val sc: SparkContext, - timeout: Long, - numWorkers: Int, - killSparkContextOnWorkerFailure: Boolean = true) { - - private[this] val requestedCores = numWorkers * sc.conf.getInt("spark.task.cpus", 1) - private[this] val logger = LogFactory.getLog("XGBoostSpark") - - private[this] def numAliveCores: Int = { - sc.statusStore.executorList(true).map(_.totalCores).sum - } - - private[this] def waitForCondition( - condition: => Boolean, - timeout: Long, - checkInterval: Long = 100L) = { - val waitImpl = new ((Long, Boolean) => Boolean) { - override def apply(waitedTime: Long, status: Boolean): Boolean = status match { - case s if s => true - case _ => waitedTime match { - case t if t < timeout => - Thread.sleep(checkInterval) - apply(t + checkInterval, status = condition) - case _ => false - } - } - } - waitImpl(0L, condition) - } - - private[this] def safeExecute[T](body: => T): T = { - val listener = new TaskFailedListener(killSparkContextOnWorkerFailure) - sc.addSparkListener(listener) - try { - body - } finally { - sc.removeSparkListener(listener) - } - } - - /** - * Execute a blocking function call with two checks on enough nWorkers: - * - Before the function starts, wait until there are enough executor cores. - * - During the execution, throws an exception if there is any executor lost. - * - * @param body A blocking function call - * @tparam T Return type - * @return The return of body - */ - def execute[T](body: => T): T = { - if (timeout <= 0) { - logger.info("starting training without setting timeout for waiting for resources") - safeExecute(body) - } else { - logger.info(s"starting training with timeout set as $timeout ms for waiting for resources") - if (!waitForCondition(numAliveCores >= requestedCores, timeout)) { - throw new IllegalStateException(s"Unable to get $requestedCores cores for XGBoost training") - } - safeExecute(body) - } - } -} - -class TaskFailedListener(killSparkContext: Boolean = true) extends SparkListener { - - private[this] val logger = LogFactory.getLog("XGBoostTaskFailedListener") - - // {jobId, [stageId0, stageId1, ...] 
} - // keep track of the mapping of job id and stage ids - // when a task fails, find the job id and stage id the task belongs to, finally - // cancel the jobs - private val jobIdToStageIds: HashMap[Int, HashSet[Int]] = HashMap.empty - - override def onJobStart(jobStart: SparkListenerJobStart): Unit = { - if (!killSparkContext) { - jobStart.stageIds.foreach(stageId => { - jobIdToStageIds.getOrElseUpdate(jobStart.jobId, new HashSet[Int]()) += stageId - }) - } - } - - override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = { - if (!killSparkContext) { - jobIdToStageIds.remove(jobEnd.jobId) - } - } - - override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = { - taskEnd.reason match { - case taskEndReason: TaskFailedReason => - logger.error(s"Training Task Failed during XGBoost Training: " + - s"$taskEndReason") - if (killSparkContext) { - logger.error("killing SparkContext") - TaskFailedListener.startedSparkContextKiller() - } else { - val stageId = taskEnd.stageId - // find job ids according to stage id and then cancel the job - - jobIdToStageIds.foreach { - case (jobId, stageIds) => - if (stageIds.contains(stageId)) { - logger.error("Cancelling jobId:" + jobId) - jobIdToStageIds.remove(jobId) - SparkContext.getOrCreate().cancelJob(jobId) - } - } - } - case _ => - } - } -} - -object TaskFailedListener { - - var killerStarted: Boolean = false - - var sparkContextKiller: Thread = _ - - val sparkContextShutdownLock = new AnyRef - - private def startedSparkContextKiller(): Unit = this.synchronized { - if (!killerStarted) { - killerStarted = true - // Spark does not allow ListenerThread to shutdown SparkContext so that we have to do it - // in a separate thread - sparkContextKiller = new Thread() { - override def run(): Unit = { - LiveListenerBus.withinListenerThread.withValue(false) { - sparkContextShutdownLock.synchronized { - SparkContext.getActive.foreach(_.stop()) - killerStarted = false - sparkContextShutdownLock.notify() - } - } - } - } - sparkContextKiller.setDaemon(true) - sparkContextKiller.start() - } - } -} diff --git a/jvm-packages/xgboost4j-spark/src/test/resources/log4j.properties b/jvm-packages/xgboost4j-spark/src/test/resources/log4j.properties index dcd02d2c878d..900a698ae76c 100644 --- a/jvm-packages/xgboost4j-spark/src/test/resources/log4j.properties +++ b/jvm-packages/xgboost4j-spark/src/test/resources/log4j.properties @@ -1 +1 @@ -log4j.logger.org.apache.spark=ERROR \ No newline at end of file +log4j.logger.org.apache.spark=ERROR diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala index 5ef49431468f..cdcfd76f5bf7 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ExternalCheckpointManagerSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,7 +19,7 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.File import ml.dmlc.xgboost4j.scala.{Booster, DMatrix, ExternalCheckpointManager, XGBoost => SXGBoost} -import org.scalatest.{FunSuite, Ignore} +import org.scalatest.FunSuite import org.apache.hadoop.fs.{FileSystem, Path} class ExternalCheckpointManagerSuite extends FunSuite with TmpFolderPerSuite with PerTest { diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala index 7e560827b5b6..79562d1f428b 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/FeatureSizeValidatingSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.XGBoostError import org.apache.spark.Partitioner import org.apache.spark.ml.feature.VectorAssembler -import org.apache.spark.sql.SparkSession import org.scalatest.FunSuite import org.apache.spark.sql.functions._ @@ -53,7 +51,7 @@ class FeatureSizeValidatingSuite extends FunSuite with PerTest { "objective" -> "binary:logistic", "num_round" -> 5, "num_workers" -> 2, "use_external_memory" -> true, "missing" -> 0) import DataUtils._ - val sparkSession = SparkSession.builder().getOrCreate() + val sparkSession = ss import sparkSession.implicits._ val repartitioned = sc.parallelize(Synthetic.trainWithDiffFeatureSize, 2) .map(lp => (lp.label, lp)).partitionBy( diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala index 9e23d81b51d1..5863e2ace566 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/MissingValueHandlingSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,14 +16,14 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.XGBoostError import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.DataFrame import org.scalatest.FunSuite - import scala.util.Random +import org.apache.spark.SparkException + class MissingValueHandlingSuite extends FunSuite with PerTest { test("dense vectors containing missing value") { def buildDenseDataFrame(): DataFrame = { @@ -113,7 +113,7 @@ class MissingValueHandlingSuite extends FunSuite with PerTest { val inputDF = vectorAssembler.transform(testDF).select("features", "label") val paramMap = List("eta" -> "1", "max_depth" -> "2", "objective" -> "binary:logistic", "missing" -> -1.0f, "num_workers" -> 1).toMap - intercept[XGBoostError] { + intercept[SparkException] { new XGBoostClassifier(paramMap).fit(inputDF) } } @@ -140,7 +140,7 @@ class MissingValueHandlingSuite extends FunSuite with PerTest { inputDF.show() val paramMap = List("eta" -> "1", "max_depth" -> "2", "objective" -> "binary:logistic", "missing" -> -1.0f, "num_workers" -> 1).toMap - intercept[XGBoostError] { + intercept[SparkException] { new XGBoostClassifier(paramMap).fit(inputDF) } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala index 50596c69f7ae..ab1226d2bf2f 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/ParameterSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,9 +16,9 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.XGBoostError -import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore} +import org.scalatest.{BeforeAndAfterAll, FunSuite} +import org.apache.spark.SparkException import org.apache.spark.ml.param.ParamMap class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll { @@ -40,28 +40,16 @@ class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll { assert(xgbCopy2.MLlib2XGBoostParams("eval_metric").toString === "logloss") } - private def waitForSparkContextShutdown(): Unit = { - var totalWaitedTime = 0L - while (!ss.sparkContext.isStopped && totalWaitedTime <= 120000) { - Thread.sleep(10000) - totalWaitedTime += 10000 - } - assert(ss.sparkContext.isStopped === true) - } - test("fail training elegantly with unsupported objective function") { val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", "objective" -> "wrong_objective_function", "num_class" -> "6", "num_round" -> 5, "num_workers" -> numWorkers) val trainingDF = buildDataFrame(MultiClassification.train) val xgb = new XGBoostClassifier(paramMap) - try { - val model = xgb.fit(trainingDF) - } catch { - case e: Throwable => // swallow anything - } finally { - waitForSparkContextShutdown() + intercept[SparkException] { + xgb.fit(trainingDF) } + } test("fail training elegantly with unsupported eval metrics") { @@ -70,12 +58,8 @@ class ParameterSuite extends FunSuite with PerTest with BeforeAndAfterAll { "num_workers" -> numWorkers, "eval_metric" -> "wrong_eval_metrics") val trainingDF = buildDataFrame(MultiClassification.train) val xgb = new XGBoostClassifier(paramMap) - try { - val model = xgb.fit(trainingDF) - } catch { - case e: Throwable => // swallow anything - } finally { - waitForSparkContextShutdown() + intercept[SparkException] { + xgb.fit(trainingDF) } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala index 6148e6dbe8e7..f5775bc4d7bb 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PerTest.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,7 +19,7 @@ package ml.dmlc.xgboost4j.scala.spark import java.io.File import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} -import org.apache.spark.{SparkConf, SparkContext, TaskFailedListener} +import org.apache.spark.SparkContext import org.apache.spark.sql._ import org.scalatest.{BeforeAndAfterEach, FunSuite} @@ -40,32 +40,16 @@ trait PerTest extends BeforeAndAfterEach { self: FunSuite => .appName("XGBoostSuite") .config("spark.ui.enabled", false) .config("spark.driver.memory", "512m") + .config("spark.barrier.sync.timeout", 10) .config("spark.task.cpus", 1) override def beforeEach(): Unit = getOrCreateSession override def afterEach() { - TaskFailedListener.sparkContextShutdownLock.synchronized { - if (currentSession != null) { - // this synchronization is mostly for the tests involving SparkContext shutdown - // for unit test involving the sparkContext shutdown there are two different events sequence - // 1. 
SparkContext killer is executed before afterEach, in this case, before SparkContext - // is fully stopped, afterEach() will block at the following code block - // 2. SparkContext killer is executed afterEach, in this case, currentSession.stop() in will - // block to wait for all msgs in ListenerBus get processed. Because currentSession.stop() - // has been called, SparkContext killer will not take effect - while (TaskFailedListener.killerStarted) { - TaskFailedListener.sparkContextShutdownLock.wait() - } - currentSession.stop() - cleanExternalCache(currentSession.sparkContext.appName) - currentSession = null - } - if (TaskFailedListener.sparkContextKiller != null) { - TaskFailedListener.sparkContextKiller.interrupt() - TaskFailedListener.sparkContextKiller = null - } - TaskFailedListener.killerStarted = false + if (currentSession != null) { + currentSession.stop() + cleanExternalCache(currentSession.sparkContext.appName) + currentSession = null } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala index a1732c7f7e1b..93b7554017a0 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/PersistenceSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014,2021 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala index 4b3d8d7c936a..7d588d97ce0a 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostConfigureSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,10 +16,8 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.Rabit import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} -import scala.collection.JavaConverters._ import org.apache.spark.sql._ import org.scalatest.FunSuite diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala index 875960ed667c..cd13e4b6cafd 100755 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,13 +16,12 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.XGBoostError import scala.util.Random import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint} import ml.dmlc.xgboost4j.scala.DMatrix -import org.apache.spark.TaskContext +import org.apache.spark.{SparkException, TaskContext} import org.scalatest.FunSuite import org.apache.spark.ml.feature.VectorAssembler @@ -375,13 +374,14 @@ class XGBoostGeneralSuite extends FunSuite with TmpFolderPerSuite with PerTest { test("throw exception for empty partition in trainingset") { val paramMap = Map("eta" -> "0.1", "max_depth" -> "6", "silent" -> "1", - "objective" -> "multi:softmax", "num_class" -> "2", "num_round" -> 5, - "num_workers" -> numWorkers, "tree_method" -> "auto") + "objective" -> "binary:logistic", "num_class" -> "2", "num_round" -> 5, + "num_workers" -> numWorkers, "tree_method" -> "auto", "allow_non_zero_for_missing" -> true) // The Dmatrix will be empty - val trainingDF = buildDataFrame(Seq(XGBLabeledPoint(1.0f, 1, Array(), Array()))) + val trainingDF = buildDataFrame(Seq(XGBLabeledPoint(1.0f, 4, + Array(0, 1, 2, 3), Array(0, 1, 2, 3)))) val xgb = new XGBoostClassifier(paramMap) - intercept[XGBoostError] { - val model = xgb.fit(trainingDF) + intercept[SparkException] { + xgb.fit(trainingDF) } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala index 7e2cbb6d537f..00a29681ca73 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRabitRegressionSuite.scala @@ -1,5 +1,5 @@ /* - Copyright (c) 2014 by Contributors + Copyright (c) 2014-2022 by Contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -16,14 +16,15 @@ package ml.dmlc.xgboost4j.scala.spark -import ml.dmlc.xgboost4j.java.{Rabit, XGBoostError} -import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} -import org.apache.spark.TaskFailedListener -import org.apache.spark.SparkException +import ml.dmlc.xgboost4j.java.Rabit +import ml.dmlc.xgboost4j.scala.Booster import scala.collection.JavaConverters._ + import org.apache.spark.sql._ import org.scalatest.FunSuite +import org.apache.spark.SparkException + class XGBoostRabitRegressionSuite extends FunSuite with PerTest { val predictionErrorMin = 0.00001f val maxFailure = 2; @@ -33,15 +34,6 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest { .config("spark.kryo.classesToRegister", classOf[Booster].getName) .master(s"local[${numWorkers},${maxFailure}]") - private def waitAndCheckSparkShutdown(waitMiliSec: Int): Boolean = { - var totalWaitedTime = 0L - while (!ss.sparkContext.isStopped && totalWaitedTime <= waitMiliSec) { - Thread.sleep(10) - totalWaitedTime += 10 - } - return ss.sparkContext.isStopped - } - test("test classification prediction parity w/o ring reduce") { val training = buildDataFrame(Classification.train) val testDF = buildDataFrame(Classification.test) @@ -91,14 +83,11 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest { } test("test rabit timeout fail handle") { - // disable spark kill listener to verify if rabit_timeout take effect and kill tasks - TaskFailedListener.killerStarted = true - val training = buildDataFrame(Classification.train) // mock rank 0 failure during 8th allreduce synchronization Rabit.mockList = Array("0,8,0,0").toList.asJava - try { + intercept[SparkException] { new XGBoostClassifier(Map( "eta" -> "0.1", "max_depth" -> "10", @@ -108,39 +97,7 @@ class XGBoostRabitRegressionSuite extends FunSuite with PerTest { "num_workers" -> numWorkers, "rabit_timeout" -> 0)) .fit(training) - } catch { - case e: Throwable => // swallow anything - } finally { - // assume all tasks throw exception almost same time - // 100ms should be enough to exhaust all retries - assert(waitAndCheckSparkShutdown(100) == true) - TaskFailedListener.killerStarted = false } } - test("test SparkContext should not be killed ") { - cancel("For some reason, sparkContext can't cancel the job locally in the CI env," + - "which will be resolved when introducing barrier mode") - val training = buildDataFrame(Classification.train) - // mock rank 0 failure during 8th allreduce synchronization - Rabit.mockList = Array("0,8,0,0").toList.asJava - - try { - new XGBoostClassifier(Map( - "eta" -> "0.1", - "max_depth" -> "10", - "verbosity" -> "1", - "objective" -> "binary:logistic", - "num_round" -> 5, - "num_workers" -> numWorkers, - "kill_spark_context_on_worker_failure" -> false, - "rabit_timeout" -> 0)) - .fit(training) - } catch { - case e: Throwable => // swallow anything - } finally { - // wait 3s to check if SparkContext is killed - assert(waitAndCheckSparkShutdown(3000) == false) - } - } } diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala index e427c17e31a5..bd104f6c7987 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostRegressorSuite.scala @@ -21,7 +21,6 @@ import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost => ScalaXGBoost} import 
org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.apache.spark.ml.feature.VectorAssembler diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/org/apache/spark/SparkParallelismTrackerSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/org/apache/spark/SparkParallelismTrackerSuite.scala deleted file mode 100644 index cb8fa579476a..000000000000 --- a/jvm-packages/xgboost4j-spark/src/test/scala/org/apache/spark/SparkParallelismTrackerSuite.scala +++ /dev/null @@ -1,151 +0,0 @@ -/* - Copyright (c) 2014 by Contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - */ - -package org.apache.spark - -import org.scalatest.FunSuite -import _root_.ml.dmlc.xgboost4j.scala.spark.PerTest -import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession - -import scala.math.min - -class SparkParallelismTrackerSuite extends FunSuite with PerTest { - - val numParallelism: Int = min(Runtime.getRuntime.availableProcessors(), 4) - - override protected def sparkSessionBuilder: SparkSession.Builder = SparkSession.builder() - .master(s"local[${numParallelism}]") - .appName("XGBoostSuite") - .config("spark.ui.enabled", true) - .config("spark.driver.memory", "512m") - .config("spark.task.cpus", 1) - - private def waitAndCheckSparkShutdown(waitMiliSec: Int): Boolean = { - var totalWaitedTime = 0L - while (!ss.sparkContext.isStopped && totalWaitedTime <= waitMiliSec) { - Thread.sleep(100) - totalWaitedTime += 100 - } - ss.sparkContext.isStopped - } - - test("tracker should not affect execution result when timeout is not larger than 0") { - val nWorkers = numParallelism - val rdd: RDD[Int] = sc.parallelize(1 to nWorkers) - val tracker = new SparkParallelismTracker(sc, 10000, nWorkers) - val disabledTracker = new SparkParallelismTracker(sc, 0, nWorkers) - assert(tracker.execute(rdd.sum()) == rdd.sum()) - assert(disabledTracker.execute(rdd.sum()) == rdd.sum()) - } - - test("tracker should throw exception if parallelism is not sufficient") { - val nWorkers = numParallelism * 3 - val rdd: RDD[Int] = sc.parallelize(1 to nWorkers) - val tracker = new SparkParallelismTracker(sc, 1000, nWorkers) - intercept[IllegalStateException] { - tracker.execute { - rdd.map { i => - // Test interruption - Thread.sleep(Long.MaxValue) - i - }.sum() - } - } - } - - test("tracker should throw exception if parallelism is not sufficient with" + - " spark.task.cpus larger than 1") { - sc.conf.set("spark.task.cpus", "2") - val nWorkers = numParallelism - val rdd: RDD[Int] = sc.parallelize(1 to nWorkers) - val tracker = new SparkParallelismTracker(sc, 1000, nWorkers) - intercept[IllegalStateException] { - tracker.execute { - rdd.map { i => - // Test interruption - Thread.sleep(Long.MaxValue) - i - }.sum() - } - } - } - - test("tracker should not kill SparkContext when killSparkContextOnWorkerFailure=false") { - val nWorkers = numParallelism - val tracker = new 
SparkParallelismTracker(sc, 0, nWorkers, false) - val rdd: RDD[Int] = sc.parallelize(1 to nWorkers, nWorkers) - try { - tracker.execute { - rdd.map { i => - val partitionId = TaskContext.get().partitionId() - if (partitionId == 0) { - throw new RuntimeException("mocking task failing") - } - i - }.sum() - } - } catch { - case e: Exception => // catch the exception - } finally { - // wait 3s to check if SparkContext is killed - assert(waitAndCheckSparkShutdown(3000) == false) - } - } - - test("tracker should cancel the correct job when killSparkContextOnWorkerFailure=false") { - val nWorkers = 2 - val tracker = new SparkParallelismTracker(sc, 0, nWorkers, false) - val rdd: RDD[Int] = sc.parallelize(1 to 10, nWorkers) - val thread = new TestThread(sc) - thread.start() - try { - tracker.execute { - rdd.map { i => - Thread.sleep(100) - val partitionId = TaskContext.get().partitionId() - if (partitionId == 0) { - throw new RuntimeException("mocking task failing") - } - i - }.sum() - } - } catch { - case e: Exception => // catch the exception - } finally { - thread.join(8000) - // wait 3s to check if SparkContext is killed - assert(waitAndCheckSparkShutdown(3000) == false) - } - } - - private[this] class TestThread(sc: SparkContext) extends Thread { - override def run(): Unit = { - var sum: Double = 0.0f - try { - val rdd = sc.parallelize(1 to 4, 2) - sum = rdd.mapPartitions(iter => { - // sleep 2s to ensure task is alive when cancelling other jobs - Thread.sleep(2000) - iter - }).sum() - } finally { - // get the correct result - assert(sum.toInt == 10) - } - } - } -} From bef1f939ceb964283f17f1f4cff3234ca18d3bc8 Mon Sep 17 00:00:00 2001 From: Bobby Wang Date: Mon, 25 Apr 2022 19:29:16 +0800 Subject: [PATCH 11/16] [doc] remove the doc about killing SparkContext [skip ci] (#7840) --- doc/jvm/xgboost4j_spark_tutorial.rst | 6 ------ 1 file changed, 6 deletions(-) diff --git a/doc/jvm/xgboost4j_spark_tutorial.rst b/doc/jvm/xgboost4j_spark_tutorial.rst index ce689cb95358..60c1dd601991 100644 --- a/doc/jvm/xgboost4j_spark_tutorial.rst +++ b/doc/jvm/xgboost4j_spark_tutorial.rst @@ -16,12 +16,6 @@ This tutorial is to cover the end-to-end process to build a machine learning pip * Building a Machine Learning Pipeline with XGBoost4J-Spark * Running XGBoost4J-Spark in Production -.. note:: - - **SparkContext will be stopped by default when XGBoost training task fails**. - - XGBoost4J-Spark 1.2.0+ exposes a parameter **kill_spark_context_on_worker_failure**. Set **kill_spark_context_on_worker_failure** to **false** so that the SparkContext will not be stopping on training failure. Instead of stopping the SparkContext, XGBoost4J-Spark will throw an exception instead. Users who want to re-use the SparkContext should wrap the training code in a try-catch block. - .. contents:: :backlinks: none :local: From ad06172c6b19dde83e2e937904b64a2d87fe01f0 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 26 Apr 2022 18:53:43 +0800 Subject: [PATCH 12/16] Refactor pandas dataframe handling. 
(#7843) --- python-package/xgboost/data.py | 133 ++++++++++++++++++++------------- 1 file changed, 80 insertions(+), 53 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 00d47599fe73..47c41d994d8b 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -6,7 +6,7 @@ import json import warnings import os -from typing import Any, Tuple, Callable, Optional, List, Union, Iterator +from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Type import numpy as np @@ -21,8 +21,6 @@ CAT_T = "c" # meta info that can be a matrix instead of vector. -# For now it's base_margin for multi-class, but it can be extended to label once we have -# multi-output. _matrix_meta = {"base_margin", "label"} @@ -253,41 +251,19 @@ def _invalid_dataframe_dtype(data: Any) -> None: raise ValueError(msg) -# pylint: disable=too-many-locals -def _transform_pandas_df( +def _pandas_feature_info( data: DataFrame, + meta: Optional[str], + feature_names: FeatureNames, + feature_types: FeatureTypes, enable_categorical: bool, - feature_names: FeatureNames = None, - feature_types: Optional[List[str]] = None, - meta: Optional[str] = None, - meta_type: Optional[str] = None, -) -> Tuple[np.ndarray, FeatureNames, Optional[List[str]]]: +) -> Tuple[FeatureNames, FeatureTypes]: import pandas as pd from pandas.api.types import ( is_sparse, is_categorical_dtype, - is_integer_dtype, - is_bool_dtype, ) - nullable_alias = {"Int16", "Int32", "Int64"} - - # dtype: pd.core.arrays.numeric.NumericDtype - def is_nullable_dtype(dtype: Any) -> bool: - is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias - # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. - is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" - return is_int or is_bool - - if not all( - dtype.name in _pandas_dtype_mapper - or is_sparse(dtype) - or is_nullable_dtype(dtype) - or (is_categorical_dtype(dtype) and enable_categorical) - for dtype in data.dtypes - ): - _invalid_dataframe_dtype(data) - # handle feature names if feature_names is None and meta is None: if isinstance(data.columns, pd.MultiIndex): @@ -300,43 +276,94 @@ def is_nullable_dtype(dtype: Any) -> bool: # handle feature types if feature_types is None and meta is None: feature_types = [] - for i, dtype in enumerate(data.dtypes): + for dtype in data.dtypes: if is_sparse(dtype): feature_types.append(_pandas_dtype_mapper[dtype.subtype.name]) elif is_categorical_dtype(dtype) and enable_categorical: feature_types.append(CAT_T) else: feature_types.append(_pandas_dtype_mapper[dtype.name]) + return feature_names, feature_types - # handle category codes. 
- transformed = pd.DataFrame() - # Avoid transformation due to: PerformanceWarning: DataFrame is highly fragmented - if ( - enable_categorical and any(is_categorical_dtype(dtype) for dtype in data.dtypes) - ) or any(is_nullable_dtype(dtype) for dtype in data.dtypes): - for i, dtype in enumerate(data.dtypes): - if is_categorical_dtype(dtype): - # pandas uses -1 as default missing value for categorical data - transformed[data.columns[i]] = ( - data[data.columns[i]] - .cat.codes.astype(np.float32) - .replace(-1.0, np.NaN) - ) - elif is_nullable_dtype(dtype): - # Converts integer to float NaN - transformed[data.columns[i]] = data[data.columns[i]].astype(np.float32) - else: - transformed[data.columns[i]] = data[data.columns[i]] + +def is_nullable_dtype(dtype: Any) -> bool: + """Whether dtype is a pandas nullable type.""" + from pandas.api.types import is_integer_dtype, is_bool_dtype + # dtype: pd.core.arrays.numeric.NumericDtype + nullable_alias = {"Int16", "Int32", "Int64"} + is_int = is_integer_dtype(dtype) and dtype.name in nullable_alias + # np.bool has alias `bool`, while pd.BooleanDtype has `boolean`. + is_bool = is_bool_dtype(dtype) and dtype.name == "boolean" + return is_int or is_bool + + +def _pandas_cat_null(data: DataFrame) -> DataFrame: + from pandas.api.types import is_categorical_dtype + # handle category codes and nullable. + cat_columns = [ + col + for col, dtype in zip(data.columns, data.dtypes) + if is_categorical_dtype(dtype) + ] + nul_columns = [ + col for col, dtype in zip(data.columns, data.dtypes) if is_nullable_dtype(dtype) + ] + if cat_columns or nul_columns: + # Avoid transformation due to: PerformanceWarning: DataFrame is highly + # fragmented + transformed = data.copy() else: transformed = data + if cat_columns: + # DF doesn't have the cat attribute, so we use apply here + transformed[cat_columns] = ( + transformed[cat_columns] + .apply(lambda x: x.cat.codes) + .astype(np.float32) + .replace(-1.0, np.NaN) + ) + if nul_columns: + transformed[nul_columns] = transformed[nul_columns].astype(np.float32) + + return transformed + + +def _transform_pandas_df( + data: DataFrame, + enable_categorical: bool, + feature_names: FeatureNames = None, + feature_types: FeatureTypes = None, + meta: Optional[str] = None, + meta_type: Optional[str] = None, +) -> Tuple[np.ndarray, FeatureNames, FeatureTypes]: + from pandas.api.types import ( + is_sparse, + is_categorical_dtype, + ) + + if not all( + dtype.name in _pandas_dtype_mapper + or is_sparse(dtype) + or is_nullable_dtype(dtype) + or (is_categorical_dtype(dtype) and enable_categorical) + for dtype in data.dtypes + ): + _invalid_dataframe_dtype(data) + + feature_names, feature_types = _pandas_feature_info( + data, meta, feature_names, feature_types, enable_categorical + ) + + transformed = _pandas_cat_null(data) + if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") - dtype = meta_type if meta_type else np.float32 - arr = transformed.values + dtype: Union[Type[np.floating], str] = meta_type if meta_type else np.float32 + arr: np.ndarray = transformed.values if meta_type: - arr = arr.astype(meta_type) + arr = arr.astype(dtype) return arr, feature_names, feature_types From fdf533f2b9af9c068cddba50839574c6abb58dc3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 26 Apr 2022 21:41:55 +0800 Subject: [PATCH 13/16] [POC] Experimental support for l1 error. (#7812) Support adaptive tree, a feature supported by both sklearn and lightgbm.
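(Background for the leaf recomputation described next; this is a standard derivation, not something introduced by the patch. For the l1 objective, the value assigned to a leaf should minimize the sum of absolute residuals of the rows falling into that leaf, and that minimizer is their median:

$$v^{*} = \arg\min_{v} \sum_{i \in \text{leaf}} \lvert r_i - v \rvert = \operatorname{median}\{r_i\}, \qquad r_i = y_i - \hat{y}_i .$$

Since the hessian of $\lvert r \rvert$ is zero almost everywhere, the usual Newton-style leaf value is undefined, which is why the leaves are recomputed from the residuals instead.)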
The tree leaf is recomputed based on the residual between labels and predictions after construction. For l1 error, the optimal value is the median (50th percentile). This is marked as experimental support for the following reasons: - The value is not well defined for distributed training, where we might have empty leaves for local workers. Right now I just use the original leaf value for computing the average with other workers, which might cause significant errors. - Some follow-ups are required for the exact tree method, the pruner, and optimization of the quantile function. Also, we need to calculate the initial estimate. --- amalgamation/xgboost-all0.cc | 1 + doc/model.schema | 9 +- doc/parameter.rst | 1 + include/xgboost/gbm.h | 5 +- include/xgboost/linalg.h | 8 +- include/xgboost/objective.h | 20 +- include/xgboost/task.h | 9 +- include/xgboost/tree_updater.h | 15 +- plugin/example/custom_obj.cc | 10 +- src/common/common.h | 99 +++++++++- src/common/device_helpers.cuh | 40 +++- src/common/linalg_op.cuh | 3 +- src/common/partition_builder.h | 28 ++- src/common/row_set.h | 23 ++- src/common/stats.cuh | 127 ++++++++++++ src/common/stats.h | 95 +++++++++ src/data/data.cc | 11 +- src/data/iterative_device_dmatrix.h | 2 +- src/gbm/gblinear.cc | 5 +- src/gbm/gbtree.cc | 101 ++++++---- src/gbm/gbtree.h | 17 +- src/learner.cc | 4 +- src/metric/auc.cu | 30 ++- src/objective/adaptive.cc | 126 ++++++++++++ src/objective/adaptive.cu | 182 ++++++++++++++++++ src/objective/adaptive.h | 83 ++++++++ src/objective/aft_obj.cu | 4 +- src/objective/hinge.cu | 6 +- src/objective/multiclass_obj.cu | 2 +- src/objective/rank_obj.cu | 9 +- src/objective/regression_loss.h | 12 +- src/objective/regression_obj.cu | 98 +++++++--- src/tree/gpu_hist/row_partitioner.cuh | 51 +++-- src/tree/hist/evaluate_splits.h | 5 +- src/tree/updater_approx.cc | 30 ++- src/tree/updater_approx.h | 9 +- src/tree/updater_colmaker.cc | 6 +- src/tree/updater_gpu_hist.cu | 172 ++++++++++------- src/tree/updater_histmaker.cc | 6 +- src/tree/updater_prune.cc | 8 +- src/tree/updater_quantile_hist.cc | 37 +++- src/tree/updater_quantile_hist.h | 25 ++- src/tree/updater_refresh.cc | 6 +- src/tree/updater_sync.cc | 6 +- tests/cpp/common/test_stats.cc | 58 ++++++ tests/cpp/common/test_stats.cu | 77 ++++++++ tests/cpp/gbm/test_gbtree.cc | 8 +- tests/cpp/helpers.cc | 2 +- tests/cpp/objective/test_regression_obj.cc | 116 ++++++++++- tests/cpp/predictor/test_cpu_predictor.cc | 4 +- .../cpp/tree/gpu_hist/test_row_partitioner.cu | 64 +++++- tests/cpp/tree/test_approx.cc | 78 +++++++- tests/cpp/tree/test_gpu_hist.cu | 23 ++- tests/cpp/tree/test_histmaker.cc | 6 +- tests/cpp/tree/test_prediction_cache.cc | 3 +- tests/cpp/tree/test_prune.cc | 11 +- tests/cpp/tree/test_refresh.cc | 3 +- tests/cpp/tree/test_tree_stat.cc | 12 +- tests/python-gpu/test_gpu_prediction.py | 4 + tests/python-gpu/test_gpu_updaters.py | 2 + tests/python-gpu/test_gpu_with_dask.py | 19 +- tests/python/test_updaters.py | 2 + tests/python/test_with_dask.py | 15 +- tests/python/testing.py | 4 + 64 files changed, 1724 insertions(+), 333 deletions(-) create mode 100644 src/common/stats.cuh create mode 100644 src/common/stats.h create mode 100644 src/objective/adaptive.cc create mode 100644 src/objective/adaptive.cu create mode 100644 src/objective/adaptive.h create mode 100644 tests/cpp/common/test_stats.cc create mode 100644 tests/cpp/common/test_stats.cu diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc index 45eb5e72593d..c684e6309de8 100644 --- a/amalgamation/xgboost-all0.cc +++ 
b/amalgamation/xgboost-all0.cc @@ -24,6 +24,7 @@ #include "../src/objective/rank_obj.cc" #include "../src/objective/hinge.cc" #include "../src/objective/aft_obj.cc" +#include "../src/objective/adaptive.cc" // gbms #include "../src/gbm/gbm.cc" diff --git a/doc/model.schema b/doc/model.schema index b192cabc6864..02725cb36d31 100644 --- a/doc/model.schema +++ b/doc/model.schema @@ -400,7 +400,6 @@ "reg_loss_param" ] }, - { "type": "object", "properties": { @@ -433,6 +432,14 @@ "tweedie_regression_param" ] }, + { + "properties": { + "name": { + "const": "reg:absoluteerror" + } + }, + "type": "object" + }, { "type": "object", "properties": { diff --git a/doc/parameter.rst b/doc/parameter.rst index 781150490082..b361b01d4d9f 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -349,6 +349,7 @@ Specify the learning task and the corresponding learning objective. The objectiv - ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issue with this objective. - ``reg:logistic``: logistic regression. - ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss. + - ``reg:absoluteerror``: Regression with L1 error. When a tree model is used, the leaf value is refreshed after tree construction. - ``binary:logistic``: logistic regression for binary classification, output probability - ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation - ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities. diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h index d24057e255a7..cce92d3679f4 100644 --- a/include/xgboost/gbm.h +++ b/include/xgboost/gbm.h @@ -90,9 +90,8 @@ class GradientBooster : public Model, public Configurable { * \param prediction The output prediction cache entry that needs to be updated. * the booster may change content of gpair */ - virtual void DoBoost(DMatrix* p_fmat, - HostDeviceVector<GradientPair>* in_gpair, - PredictionCacheEntry*) = 0; + virtual void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, + PredictionCacheEntry*, ObjFunction const* obj) = 0; /*! * \brief generate predictions for given feature matrix diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 32d0f9fb9f9c..015121560039 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -670,9 +670,13 @@ class Tensor { * See \ref TensorView for parameters of this constructor. */ template <typename I, int32_t D> - explicit Tensor(I const (&shape)[D], int32_t device) { + explicit Tensor(I const (&shape)[D], int32_t device) + : Tensor{common::Span<I const, D>{shape}, device} {} + + template <typename I, size_t D> + explicit Tensor(common::Span<I const, D> shape, int32_t device) { // No device unroll as this is a host only function. - std::copy(shape, shape + D, shape_); + std::copy(shape.data(), shape.data() + D, shape_); for (auto i = D; i < kDim; ++i) { shape_[i] = 1; } diff --git a/include/xgboost/objective.h b/include/xgboost/objective.h index 44dc46ddc8da..cb0fe7741dc9 100644 --- a/include/xgboost/objective.h +++ b/include/xgboost/objective.h @@ -1,5 +1,5 @@ /*! - * Copyright 2014-2019 by Contributors + * Copyright 2014-2022 by Contributors * \file objective.h * \brief interface of objective function used by xgboost. * \author Tianqi Chen, Kailong Chen @@ -22,6 +22,8 @@ namespace xgboost { +class RegTree; + /*!
\brief interface of objective function */ class ObjFunction : public Configurable { protected: @@ -88,6 +90,22 @@ class ObjFunction : public Configurable { return 1; } + /** + * \brief Update the leaf values after a tree is built. Needed for objectives with 0 + * hessian. + * + * Note that the leaf update is not well defined for distributed training as XGBoost + * computes only an average of quantiles across workers. This breaks when some leaves + * have no samples assigned in a local worker. + * + * \param position The leaf index for each row. + * \param info MetaInfo providing labels and weights. + * \param prediction Model prediction after transformation. + * \param p_tree Tree that needs to be updated. + */ + virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info, + HostDeviceVector<float> const& prediction, RegTree* p_tree) const {} + /*! * \brief Create an objective function according to name. * \param tparam Generic parameters. diff --git a/include/xgboost/task.h b/include/xgboost/task.h index 537320657544..739207a309d8 100644 --- a/include/xgboost/task.h +++ b/include/xgboost/task.h @@ -33,13 +33,18 @@ struct ObjInfo { } task; // Does the objective have constant hessian value? bool const_hess{false}; + bool zero_hess{false}; - explicit ObjInfo(Task t) : task{t} {} - ObjInfo(Task t, bool khess) : task{t}, const_hess{khess} {} + ObjInfo(Task t) : task{t} {} // NOLINT + ObjInfo(Task t, bool khess, bool zhess) : task{t}, const_hess{khess}, zero_hess(zhess) {} XGBOOST_DEVICE bool UseOneHot() const { return (task != ObjInfo::kRegression && task != ObjInfo::kBinary); } + /** + * \brief Use adaptive tree if the objective doesn't have a valid hessian value. + */ + XGBOOST_DEVICE bool UpdateTreeLeaf() const { return zero_hess; } }; } // namespace xgboost #endif // XGBOOST_TASK_H_ diff --git a/include/xgboost/tree_updater.h b/include/xgboost/tree_updater.h index 6189221dc0bf..f0fabb26d9a0 100644 --- a/include/xgboost/tree_updater.h +++ b/include/xgboost/tree_updater.h @@ -49,18 +49,25 @@ class TreeUpdater : public Configurable { * existing trees. */ virtual bool CanModifyTree() const { return false; } + /*! + * \brief Whether the out_position in `Update` is valid. This determines whether adaptive + * tree can be used. + */ + virtual bool HasNodePosition() const { return false; } /*! * \brief perform update to the tree models * \param gpair the gradient pair statistics of the data * \param data The data matrix passed to the updater. - * \param trees references the trees to be updated, updater will change the content of trees + * \param out_position The leaf index for each row. The index is negated if that row is + * removed during sampling. So the 3rd node is ~3. + * \param out_trees references the trees to be updated, updater will change the content of trees * note: all the trees in the vector are updated, with the same statistics, * but maybe different random seeds, usually one tree is passed in at a time, * there can be multiple trees when we train random forest style model */ - virtual void Update(HostDeviceVector<GradientPair>* gpair, - DMatrix* data, - const std::vector<RegTree*>& trees) = 0; + virtual void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* data, + common::Span<HostDeviceVector<bst_node_t>> out_position, + const std::vector<RegTree*>& out_trees) = 0; /*!
 /*!
   * \brief determines whether updater has enough knowledge about a given dataset
diff --git a/plugin/example/custom_obj.cc b/plugin/example/custom_obj.cc
index b61073360e00..e220e4497141 100644
--- a/plugin/example/custom_obj.cc
+++ b/plugin/example/custom_obj.cc
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015-2019 by Contributors
+ * Copyright 2015-2022 by Contributors
 * \file custom_metric.cc
 * \brief This is an example to define plugin of xgboost.
 *  This plugin defines the additional metric function.
@@ -31,13 +31,9 @@ DMLC_REGISTER_PARAMETER(MyLogisticParam);
 // Implement the interface.
 class MyLogistic : public ObjFunction {
 public:
-  void Configure(const std::vector<std::pair<std::string, std::string>>& args) override {
-    param_.UpdateAllowUnknown(args);
-  }
+  void Configure(const Args& args) override { param_.UpdateAllowUnknown(args); }
 
-  struct ObjInfo Task() const override {
-    return {ObjInfo::kRegression, false};
-  }
+  ObjInfo Task() const override { return ObjInfo::kRegression; }
 
   void GetGradient(const HostDeviceVector<bst_float> &preds,
                    const MetaInfo &info,
diff --git a/src/common/common.h b/src/common/common.h
index fb7e7fee55da..aa2d8197b4a1 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015-2018 by Contributors
+ * Copyright 2015-2022 by XGBoost Contributors
 * \file common.h
 * \brief Common utilities
 */
@@ -14,12 +14,12 @@
 #include
 #include
 #include
-#include
-#include
-#include
-#include
 #include
+#include
+#include
+#include
 #include
+#include
 
 #if defined(__CUDACC__)
 #include
@@ -164,6 +164,67 @@ class Range {
   Iterator end_;
 };
 
+/**
+ * \brief Transform iterator that takes an index and calls the transform operator.
+ *
+ * This is CPU-only right now, as taking a host-device function as the operator
+ * complicates the code. For the device side one can use `thrust::transform_iterator`
+ * instead.
+ */
+template <typename Fn>
+class IndexTransformIter {
+  size_t iter_{0};
+  Fn fn_;
+
+ public:
+  using iterator_category = std::random_access_iterator_tag;  // NOLINT
+  using value_type = std::result_of_t<Fn(size_t)>;            // NOLINT
+  using difference_type = detail::ptrdiff_t;                  // NOLINT
+  using reference = std::add_lvalue_reference_t<value_type>;  // NOLINT
+  using pointer = std::add_pointer_t<value_type>;             // NOLINT
+
+ public:
+  /**
+   * \param op Transform operator, takes a size_t index as input.
+ */ + explicit IndexTransformIter(Fn &&op) : fn_{op} {} + IndexTransformIter(IndexTransformIter const &) = default; + + value_type operator*() const { return fn_(iter_); } + + auto operator-(IndexTransformIter const &that) const { return iter_ - that.iter_; } + + IndexTransformIter &operator++() { + iter_++; + return *this; + } + IndexTransformIter operator++(int) { + auto ret = *this; + ++(*this); + return ret; + } + IndexTransformIter &operator+=(difference_type n) { + iter_ += n; + return *this; + } + IndexTransformIter &operator-=(difference_type n) { + (*this) += -n; + return *this; + } + IndexTransformIter operator+(difference_type n) const { + auto ret = *this; + return ret += n; + } + IndexTransformIter operator-(difference_type n) const { + auto ret = *this; + return ret -= n; + } +}; + +template +auto MakeIndexTransformIter(Fn&& fn) { + return IndexTransformIter(std::forward(fn)); +} + int AllVisibleGPUs(); inline void AssertGPUSupport() { @@ -191,13 +252,39 @@ std::vector ArgSort(Container const &array, Comp comp = std::less{}) { struct OptionalWeights { Span weights; - float dft{1.0f}; + float dft{1.0f}; // fixme: make this compile time constant explicit OptionalWeights(Span w) : weights{w} {} explicit OptionalWeights(float w) : dft{w} {} XGBOOST_DEVICE float operator[](size_t i) const { return weights.empty() ? dft : weights[i]; } }; + +/** + * Last index of a group in a CSR style of index pointer. + */ +template +XGBOOST_DEVICE size_t LastOf(size_t group, Indexable const &indptr) { + return indptr[group + 1] - 1; +} + +/** + * \brief Run length encode on CPU, input must be sorted. + */ +template +void RunLengthEncode(Iter begin, Iter end, std::vector *p_out) { + auto &out = *p_out; + out = std::vector{0}; + size_t n = std::distance(begin, end); + for (size_t i = 1; i < n; ++i) { + if (begin[i] != begin[i - 1]) { + out.push_back(i); + } + } + if (out.back() != n) { + out.push_back(n); + } +} } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_COMMON_H_ diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 9adf866fece9..334e3b4f89bf 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2021 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #pragma once #include @@ -1537,6 +1537,43 @@ void SegmentedArgSort(xgboost::common::Span values, sorted_idx.size_bytes(), cudaMemcpyDeviceToDevice)); } +/** + * \brief Different from the above one, this one can handle cases where segment doesn't + * start from 0, but as a result it uses comparison sort. 
+ */ +template +void SegmentedArgSort(SegIt seg_begin, SegIt seg_end, ValIt val_begin, ValIt val_end, + dh::device_vector *p_sorted_idx) { + using Tup = thrust::tuple; + auto &sorted_idx = *p_sorted_idx; + size_t n = std::distance(val_begin, val_end); + sorted_idx.resize(n); + dh::Iota(dh::ToSpan(sorted_idx)); + dh::device_vector keys(sorted_idx.size()); + auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) -> Tup { + int32_t leaf_idx; + if (i < *seg_begin) { + leaf_idx = -1; + } else { + leaf_idx = dh::SegmentId(seg_begin, seg_end, i); + } + auto residue = val_begin[i]; + return thrust::make_tuple(leaf_idx, residue); + }); + dh::XGBCachingDeviceAllocator caching; + thrust::copy(thrust::cuda::par(caching), key_it, key_it + keys.size(), keys.begin()); + + dh::XGBDeviceAllocator alloc; + thrust::stable_sort_by_key(thrust::cuda::par(alloc), keys.begin(), keys.end(), sorted_idx.begin(), + [=] XGBOOST_DEVICE(Tup const &l, Tup const &r) { + if (thrust::get<0>(l) != thrust::get<0>(r)) { + return thrust::get<0>(l) < thrust::get<0>(r); // segment index + } + return thrust::get<1>(l) < thrust::get<1>(r); // residue + }); +} + class CUDAStreamView; class CUDAEvent { @@ -1600,5 +1637,6 @@ class CUDAStream { } CUDAStreamView View() const { return CUDAStreamView{stream_}; } + void Sync() { this->View().Sync(); } }; } // namespace dh diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index f0f89df8ab26..558a09ca6acb 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -13,6 +13,7 @@ namespace xgboost { namespace linalg { template void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { + dh::safe_cuda(cudaSetDevice(t.DeviceIdx())); static_assert(std::is_void>::value, "For function with return, use transform instead."); if (t.Contiguous()) { @@ -40,7 +41,7 @@ void ElementWiseTransformDevice(linalg::TensorView t, Fn&& fn, cudaStream_ } template -void ElementWiseKernel(GenericParameter const* ctx, linalg::TensorView t, Fn&& fn) { +void ElementWiseKernel(Context const* ctx, linalg::TensorView t, Fn&& fn) { ctx->IsCPU() ? 
ElementWiseKernelHost(t, ctx->Threads(), fn) : ElementWiseKernelDevice(t, fn); } } // namespace linalg diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 3250b9d2bf25..648cbe61a3a3 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -12,10 +12,12 @@ #include #include #include +#include #include #include "categorical.h" #include "column_matrix.h" +#include "xgboost/generic_parameters.h" #include "xgboost/tree_model.h" namespace xgboost { @@ -254,7 +256,7 @@ class PartitionBuilder { n_left += mem_blocks_[j]->n_left; } size_t n_right = 0; - for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i+1]; ++j) { + for (size_t j = blocks_offsets_[i]; j < blocks_offsets_[i + 1]; ++j) { mem_blocks_[j]->n_offset_right = n_left + n_right; n_right += mem_blocks_[j]->n_right; } @@ -279,6 +281,30 @@ class PartitionBuilder { return blocks_offsets_[nid] + begin / BlockSize; } + // Copy row partitions into global cache for reuse in objective + template + void LeafPartition(Context const* ctx, RegTree const& tree, RowSetCollection const& row_set, + std::vector* p_position, Sampledp sampledp) const { + auto& h_pos = *p_position; + h_pos.resize(row_set.Data()->size(), std::numeric_limits::max()); + + auto p_begin = row_set.Data()->data(); + ParallelFor(row_set.Size(), ctx->Threads(), [&](size_t i) { + auto const& node = row_set[i]; + if (node.node_id < 0) { + return; + } + CHECK(tree[node.node_id].IsLeaf()); + if (node.begin) { // guard for empty node. + size_t ptr_offset = node.end - p_begin; + CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; + for (auto idx = node.begin; idx != node.end; ++idx) { + h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id; + } + } + }); + } + protected: struct BlockInfo{ size_t n_left; diff --git a/src/common/row_set.h b/src/common/row_set.h index dc61d5f5d877..87d5f52874f2 100644 --- a/src/common/row_set.h +++ b/src/common/row_set.h @@ -1,5 +1,5 @@ /*! - * Copyright 2017 by Contributors + * Copyright 2017-2022 by Contributors * \file row_set.h * \brief Quick Utility to compute subset of rows * \author Philip Cho, Tianqi Chen @@ -15,10 +15,15 @@ namespace xgboost { namespace common { - /*! \brief collection of rowset */ class RowSetCollection { public: + RowSetCollection() = default; + RowSetCollection(RowSetCollection const&) = delete; + RowSetCollection(RowSetCollection&&) = default; + RowSetCollection& operator=(RowSetCollection const&) = delete; + RowSetCollection& operator=(RowSetCollection&&) = default; + /*! \brief data structure to store an instance set, a subset of * rows (instances) associated with a particular node in a decision * tree. */ @@ -38,20 +43,17 @@ class RowSetCollection { return end - begin; } }; - /* \brief specifies how to split a rowset into two */ - struct Split { - std::vector left; - std::vector right; - }; - inline std::vector::const_iterator begin() const { // NOLINT + std::vector::const_iterator begin() const { // NOLINT return elem_of_each_node_.begin(); } - inline std::vector::const_iterator end() const { // NOLINT + std::vector::const_iterator end() const { // NOLINT return elem_of_each_node_.end(); } + size_t Size() const { return std::distance(begin(), end()); } + /*! 
 \brief return corresponding element set given the node_id */
  inline const Elem& operator[](unsigned node_id) const {
    const Elem& e = elem_of_each_node_[node_id];
@@ -86,6 +88,8 @@ class RowSetCollection {
   }
 
   std::vector<size_t>* Data() { return &row_indices_; }
+  std::vector<size_t> const* Data() const { return &row_indices_; }
+
   // split rowset into two
   inline void AddSplit(unsigned node_id, unsigned left_node_id, unsigned right_node_id,
                        size_t n_left, size_t n_right) {
@@ -123,7 +127,6 @@ class RowSetCollection {
   // vector: node_id -> elements
   std::vector<Elem> elem_of_each_node_;
 };
-
 }  // namespace common
 }  // namespace xgboost
diff --git a/src/common/stats.cuh b/src/common/stats.cuh
new file mode 100644
index 000000000000..9d9e526a8576
--- /dev/null
+++ b/src/common/stats.cuh
@@ -0,0 +1,127 @@
+/*!
+ * Copyright 2022 by XGBoost Contributors
+ */
+#ifndef XGBOOST_COMMON_STATS_CUH_
+#define XGBOOST_COMMON_STATS_CUH_
+
+#include
+#include
+
+#include   // std::distance
+
+#include "device_helpers.cuh"
+#include "linalg_op.cuh"
+#include "xgboost/generic_parameters.h"
+#include "xgboost/linalg.h"
+#include "xgboost/tree_model.h"
+
+namespace xgboost {
+namespace common {
+/**
+ * \brief Compute segmented quantile on GPU.
+ *
+ * \tparam SegIt Iterator for CSR style segments indptr
+ * \tparam ValIt Iterator for values
+ *
+ * \param alpha The quantile level to compute, in [0, 1].
+ *
+ * std::distance(seg_begin, seg_end) should be equal to n_segments + 1
+ */
+template <typename SegIt, typename ValIt>
+void SegmentedQuantile(Context const* ctx, double alpha, SegIt seg_begin, SegIt seg_end,
+                       ValIt val_begin, ValIt val_end, HostDeviceVector<float>* quantiles) {
+  CHECK(alpha >= 0 && alpha <= 1);
+
+  dh::device_vector<size_t> sorted_idx;
+  using Tup = thrust::tuple<int32_t, float>;
+  dh::SegmentedArgSort(seg_begin, seg_end, val_begin, val_end, &sorted_idx);
+  auto n_segments = std::distance(seg_begin, seg_end) - 1;
+  if (n_segments <= 0) {
+    return;
+  }
+
+  quantiles->SetDevice(ctx->gpu_id);
+  quantiles->Resize(n_segments);
+  auto d_results = quantiles->DeviceSpan();
+  auto d_sorted_idx = dh::ToSpan(sorted_idx);
+
+  auto val = thrust::make_permutation_iterator(val_begin, dh::tcbegin(d_sorted_idx));
+
+  dh::LaunchN(n_segments, [=] XGBOOST_DEVICE(size_t i) {
+    // each segment is the index of a leaf.
+ size_t seg_idx = i; + size_t begin = seg_begin[seg_idx]; + auto n = static_cast(seg_begin[seg_idx + 1] - begin); + if (n == 0) { + d_results[i] = std::numeric_limits::quiet_NaN(); + return; + } + + if (alpha <= (1 / (n + 1))) { + d_results[i] = val[begin]; + return; + } + if (alpha >= (n / (n + 1))) { + d_results[i] = val[common::LastOf(seg_idx, seg_begin)]; + return; + } + + double x = alpha * static_cast(n + 1); + double k = std::floor(x) - 1; + double d = (x - 1) - k; + + auto v0 = val[begin + static_cast(k)]; + auto v1 = val[begin + static_cast(k) + 1]; + d_results[seg_idx] = v0 + d * (v1 - v0); + }); +} + +template +void SegmentedWeightedQuantile(Context const* ctx, double alpha, SegIt seg_beg, SegIt seg_end, + ValIt val_begin, ValIt val_end, WIter w_begin, WIter w_end, + HostDeviceVector* quantiles) { + CHECK(alpha >= 0 && alpha <= 1); + dh::device_vector sorted_idx; + dh::SegmentedArgSort(seg_beg, seg_end, val_begin, val_end, &sorted_idx); + auto d_sorted_idx = dh::ToSpan(sorted_idx); + size_t n_weights = std::distance(w_begin, w_end); + dh::device_vector weights_cdf(n_weights); + + dh::XGBCachingDeviceAllocator caching; + auto scan_key = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) { return dh::SegmentId(seg_beg, seg_end, i); }); + auto scan_val = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) { return w_begin[d_sorted_idx[i]]; }); + thrust::inclusive_scan_by_key(thrust::cuda::par(caching), scan_key, scan_key + n_weights, + scan_val, weights_cdf.begin()); + + auto n_segments = std::distance(seg_beg, seg_end) - 1; + quantiles->SetDevice(ctx->gpu_id); + quantiles->Resize(n_segments); + auto d_results = quantiles->DeviceSpan(); + auto d_weight_cdf = dh::ToSpan(weights_cdf); + + dh::LaunchN(n_segments, [=] XGBOOST_DEVICE(size_t i) { + size_t seg_idx = i; + size_t begin = seg_beg[seg_idx]; + auto n = static_cast(seg_beg[seg_idx + 1] - begin); + if (n == 0) { + d_results[i] = std::numeric_limits::quiet_NaN(); + return; + } + auto leaf_cdf = d_weight_cdf.subspan(begin, static_cast(n)); + auto leaf_sorted_idx = d_sorted_idx.subspan(begin, static_cast(n)); + float thresh = leaf_cdf.back() * alpha; + + size_t idx = thrust::lower_bound(thrust::seq, leaf_cdf.data(), + leaf_cdf.data() + leaf_cdf.size(), thresh) - + leaf_cdf.data(); + idx = std::min(idx, static_cast(n - 1)); + d_results[i] = val_begin[leaf_sorted_idx[idx]]; + }); +} +} // namespace common +} // namespace xgboost +#endif // XGBOOST_COMMON_STATS_CUH_ diff --git a/src/common/stats.h b/src/common/stats.h new file mode 100644 index 000000000000..4ad9e4aa770a --- /dev/null +++ b/src/common/stats.h @@ -0,0 +1,95 @@ +/*! + * Copyright 2022 by XGBoost Contributors + */ +#ifndef XGBOOST_COMMON_STATS_H_ +#define XGBOOST_COMMON_STATS_H_ +#include +#include +#include +#include + +#include "common.h" +#include "xgboost/linalg.h" + +namespace xgboost { +namespace common { + +/** + * \brief Percentile with masked array using linear interpolation. + * + * https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm + * + * \param alpha Percentile, must be in range [0, 1]. + * \param begin Iterator begin for input array. + * \param end Iterator end for input array. + * + * \return The result of interpolation. 
+ */
+template <typename Iter>
+float Quantile(double alpha, Iter const& begin, Iter const& end) {
+  CHECK(alpha >= 0 && alpha <= 1);
+  auto n = static_cast<double>(std::distance(begin, end));
+  if (n == 0) {
+    return std::numeric_limits<float>::quiet_NaN();
+  }
+
+  std::vector<size_t> sorted_idx(n);
+  std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
+  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+
+  auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
+  static_assert(std::is_same<decltype(val(0)), float>::value, "");
+
+  if (alpha <= (1 / (n + 1))) {
+    return val(0);
+  }
+  if (alpha >= (n / (n + 1))) {
+    return val(sorted_idx.size() - 1);
+  }
+  assert(n != 0 && "The number of rows in a leaf can not be zero.");
+  double x = alpha * static_cast<double>((n + 1));
+  double k = std::floor(x) - 1;
+  CHECK_GE(k, 0);
+  double d = (x - 1) - k;
+
+  auto v0 = val(static_cast<size_t>(k));
+  auto v1 = val(static_cast<size_t>(k) + 1);
+  return v0 + d * (v1 - v0);
+}
+
+/**
+ * \brief Calculate the weighted quantile with step function. Unlike the unweighted
+ *        version, no interpolation is used.
+ *
+ * See https://aakinshin.net/posts/weighted-quantiles/ for some discussion on computing
+ * weighted quantile with interpolation.
+ */
+template <typename Iter, typename WeightIter>
+float WeightedQuantile(double alpha, Iter begin, Iter end, WeightIter weights) {
+  auto n = static_cast<double>(std::distance(begin, end));
+  if (n == 0) {
+    return std::numeric_limits<float>::quiet_NaN();
+  }
+  std::vector<size_t> sorted_idx(n);
+  std::iota(sorted_idx.begin(), sorted_idx.end(), 0);
+  std::stable_sort(sorted_idx.begin(), sorted_idx.end(),
+                   [&](size_t l, size_t r) { return *(begin + l) < *(begin + r); });
+
+  auto val = [&](size_t i) { return *(begin + sorted_idx[i]); };
+
+  std::vector<float> weight_cdf(n);  // S_n
+  // weighted cdf is sorted during construction
+  weight_cdf[0] = *(weights + sorted_idx[0]);
+  for (size_t i = 1; i < n; ++i) {
+    weight_cdf[i] = weight_cdf[i - 1] + *(weights + sorted_idx[i]);
+  }
+  float thresh = weight_cdf.back() * alpha;
+  size_t idx =
+      std::lower_bound(weight_cdf.cbegin(), weight_cdf.cend(), thresh) - weight_cdf.cbegin();
+  idx = std::min(idx, static_cast<size_t>(n - 1));
+  return val(idx);
+}
+}  // namespace common
+}  // namespace xgboost
+#endif  // XGBOOST_COMMON_STATS_H_
diff --git a/src/data/data.cc b/src/data/data.cc
index 86f73523a39d..c297527c6bae 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -512,16 +512,7 @@ void MetaInfo::SetInfoFromHost(Context const& ctx, StringView key, Json arr) {
       }
     }
     CHECK(non_dec) << "`qid` must be sorted in non-decreasing order along with data.";
-    group_ptr_.clear();
-    group_ptr_.push_back(0);
-    for (size_t i = 1; i < query_ids.size(); ++i) {
-      if (query_ids[i] != query_ids[i - 1]) {
-        group_ptr_.push_back(i);
-      }
-    }
-    if (group_ptr_.back() != query_ids.size()) {
-      group_ptr_.push_back(query_ids.size());
-    }
+    common::RunLengthEncode(query_ids.cbegin(), query_ids.cend(), &group_ptr_);
     data::ValidateQueryGroup(group_ptr_);
     return;
   }
diff --git a/src/data/iterative_device_dmatrix.h b/src/data/iterative_device_dmatrix.h
index ba2d4a92f9da..031b289f2760 100644
--- a/src/data/iterative_device_dmatrix.h
+++ b/src/data/iterative_device_dmatrix.h
@@ -68,7 +68,7 @@ class IterativeDeviceDMatrix : public DMatrix {
 
   BatchSet<EllpackPage> GetEllpackBatches(const BatchParam& param) override;
 
-  bool SingleColBlock() const override { return false; }
+  bool SingleColBlock() const override { return true; }
 
   MetaInfo &Info() override { return info_; }
   MetaInfo const &Info() const override { return info_; }
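For reference, a minimal usage sketch of the `Quantile` and `WeightedQuantile` helpers
added in `src/common/stats.h` above; the numbers are invented for illustration and the
include path is an assumption:

    #include <vector>
    #include "stats.h"  // assumed include path for src/common/stats.h

    float QuantileDemo() {
      std::vector<float> v{3.f, 1.f, 2.f, 4.f};
      std::vector<float> w{1.f, 1.f, 1.f, 1.f};
      // Interpolating quantile: the median of {1, 2, 3, 4} is 2.5.
      float q = xgboost::common::Quantile(0.5, v.cbegin(), v.cend());
      // Step-function weighted quantile: always an element of the input, 2 here.
      float wq = xgboost::common::WeightedQuantile(0.5, v.cbegin(), v.cend(), w.cbegin());
      return q + wq;  // 2.5 + 2.0 = 4.5
    }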
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index cbf6ffebfca5..0e983fe4b37f 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -134,9 +134,8 @@ class GBLinear : public GradientBooster {
     this->updater_->SaveConfig(&j_updater);
   }
 
-  void DoBoost(DMatrix *p_fmat,
-               HostDeviceVector<GradientPair> *in_gpair,
-               PredictionCacheEntry*) override {
+  void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair, PredictionCacheEntry*,
+               ObjFunction const*) override {
     monitor_.Start("DoBoost");
     model_.LazyInitModel();
diff --git a/src/gbm/gbtree.cc b/src/gbm/gbtree.cc
index ec611ee95a68..bb7c341f8beb 100644
--- a/src/gbm/gbtree.cc
+++ b/src/gbm/gbtree.cc
@@ -1,33 +1,34 @@
 /*!
- * Copyright 2014-2021 by Contributors
+ * Copyright 2014-2022 by Contributors
 * \file gbtree.cc
 * \brief gradient boosted tree implementation.
 * \author Tianqi Chen
 */
+#include "gbtree.h"
+
 #include
 #include
-#include
+#include
+#include
 #include
-#include
 #include
-#include
-#include
+#include
+#include
 
+#include "../common/common.h"
+#include "../common/random.h"
+#include "../common/threading_utils.h"
+#include "../common/timer.h"
+#include "gbtree_model.h"
 #include "xgboost/data.h"
 #include "xgboost/gbm.h"
-#include "xgboost/logging.h"
+#include "xgboost/host_device_vector.h"
 #include "xgboost/json.h"
+#include "xgboost/logging.h"
+#include "xgboost/objective.h"
 #include "xgboost/predictor.h"
 #include "xgboost/tree_updater.h"
-#include "xgboost/host_device_vector.h"
-
-#include "gbtree.h"
-#include "gbtree_model.h"
-#include "../common/common.h"
-#include "../common/random.h"
-#include "../common/timer.h"
-#include "../common/threading_utils.h"
 
 namespace xgboost {
 namespace gbm {
@@ -216,53 +217,68 @@ void CopyGradient(HostDeviceVector<GradientPair> const* in_gpair, int32_t n_thre
   }
 }
 
-void GBTree::DoBoost(DMatrix* p_fmat,
-                     HostDeviceVector<GradientPair>* in_gpair,
-                     PredictionCacheEntry* predt) {
-  std::vector<std::vector<std::unique_ptr<RegTree> > > new_trees;
+void GBTree::UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector<float> const& predictions,
+                            ObjFunction const* obj, size_t gidx,
+                            std::vector<std::unique_ptr<RegTree>>* p_trees) {
+  CHECK(!updaters_.empty());
+  if (!updaters_.back()->HasNodePosition()) {
+    return;
+  }
+  if (!obj || !obj->Task().UpdateTreeLeaf()) {
+    return;
+  }
+  auto& trees = *p_trees;
+  for (size_t tree_idx = 0; tree_idx < trees.size(); ++tree_idx) {
+    auto const& position = this->node_position_.at(tree_idx);
+    obj->UpdateTreeLeaf(position, p_fmat->Info(), predictions, trees[tree_idx].get());
+  }
+}
+
+void GBTree::DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
+                     PredictionCacheEntry* predt, ObjFunction const* obj) {
+  std::vector<std::vector<std::unique_ptr<RegTree>>> new_trees;
   const int ngroup = model_.learner_model_param->num_output_group;
   ConfigureWithKnownData(this->cfg_, p_fmat);
   monitor_.Start("BoostNewTrees");
   // Weird case that tree method is cpu-based but gpu_id is set.  Ideally we should let
   // `gpu_id` be the single source of determining what algorithms to run, but that will
   // break a lot of existing code.
-  auto device = tparam_.tree_method != TreeMethod::kGPUHist
-                    ? GenericParameter::kCpuId
-                    : ctx_->gpu_id;
+  auto device = tparam_.tree_method != TreeMethod::kGPUHist ? Context::kCpuId : ctx_->gpu_id;
   auto out = linalg::TensorView<float>{
-      device == GenericParameter::kCpuId ? predt->predictions.HostSpan()
-                                         : predt->predictions.DeviceSpan(),
-      {static_cast<size_t>(p_fmat->Info().num_row_),
-       static_cast<size_t>(ngroup)},
+      device == Context::kCpuId ?
predt->predictions.HostSpan() : predt->predictions.DeviceSpan(), + {static_cast(p_fmat->Info().num_row_), static_cast(ngroup)}, device}; CHECK_NE(ngroup, 0); + + if (!p_fmat->SingleColBlock() && obj->Task().UpdateTreeLeaf()) { + LOG(FATAL) << "Current objective doesn't support external memory."; + } + if (ngroup == 1) { std::vector> ret; BoostNewTrees(in_gpair, p_fmat, 0, &ret); + UpdateTreeLeaf(p_fmat, predt->predictions, obj, 0, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); auto v_predt = out.Slice(linalg::All(), 0); - if (updaters_.size() > 0 && num_new_trees == 1 && - predt->predictions.Size() > 0 && + if (updaters_.size() > 0 && num_new_trees == 1 && predt->predictions.Size() > 0 && updaters_.back()->UpdatePredictionCache(p_fmat, v_predt)) { predt->Update(1); } } else { - CHECK_EQ(in_gpair->Size() % ngroup, 0U) - << "must have exactly ngroup * nrow gpairs"; - HostDeviceVector tmp(in_gpair->Size() / ngroup, - GradientPair(), + CHECK_EQ(in_gpair->Size() % ngroup, 0U) << "must have exactly ngroup * nrow gpairs"; + HostDeviceVector tmp(in_gpair->Size() / ngroup, GradientPair(), in_gpair->DeviceIdx()); bool update_predict = true; for (int gid = 0; gid < ngroup; ++gid) { CopyGradient(in_gpair, ctx_->Threads(), ngroup, gid, &tmp); - std::vector > ret; + std::vector> ret; BoostNewTrees(&tmp, p_fmat, gid, &ret); + UpdateTreeLeaf(p_fmat, predt->predictions, obj, gid, &ret); const size_t num_new_trees = ret.size(); new_trees.push_back(std::move(ret)); auto v_predt = out.Slice(linalg::All(), gid); - if (!(updaters_.size() > 0 && predt->predictions.Size() > 0 && - num_new_trees == 1 && + if (!(updaters_.size() > 0 && predt->predictions.Size() > 0 && num_new_trees == 1 && updaters_.back()->UpdatePredictionCache(p_fmat, v_predt))) { update_predict = false; } @@ -271,6 +287,7 @@ void GBTree::DoBoost(DMatrix* p_fmat, predt->Update(1); } } + monitor_.Stop("BoostNewTrees"); this->CommitModel(std::move(new_trees), p_fmat, predt); } @@ -316,10 +333,8 @@ void GBTree::InitUpdater(Args const& cfg) { } } -void GBTree::BoostNewTrees(HostDeviceVector* gpair, - DMatrix *p_fmat, - int bst_group, - std::vector >* ret) { +void GBTree::BoostNewTrees(HostDeviceVector* gpair, DMatrix* p_fmat, int bst_group, + std::vector>* ret) { std::vector new_trees; ret->clear(); // create the trees @@ -338,9 +353,9 @@ void GBTree::BoostNewTrees(HostDeviceVector* gpair, } else if (tparam_.process_type == TreeProcessType::kUpdate) { for (auto const& up : updaters_) { CHECK(up->CanModifyTree()) - << "Updater: `" << up->Name() << "` " - << "can not be used to modify existing trees. " - << "Set `process_type` to `default` if you want to build new trees."; + << "Updater: `" << up->Name() << "` " + << "can not be used to modify existing trees. " + << "Set `process_type` to `default` if you want to build new trees."; } CHECK_LT(model_.trees.size(), model_.trees_to_update.size()) << "No more tree left for updating. For updating existing trees, " @@ -356,8 +371,10 @@ void GBTree::BoostNewTrees(HostDeviceVector* gpair, CHECK_EQ(gpair->Size(), p_fmat->Info().num_row_) << "Mismatching size between number of rows from input data and size of " "gradient vector."; + node_position_.resize(new_trees.size()); for (auto& up : updaters_) { - up->Update(gpair, p_fmat, new_trees); + up->Update(gpair, p_fmat, common::Span>{node_position_}, + new_trees); } } diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h index 67d9e212888a..020b7d0cb9c0 100644 --- a/src/gbm/gbtree.h +++ b/src/gbm/gbtree.h @@ -1,5 +1,5 @@ /*! 
- * Copyright 2014-2021 by Contributors + * Copyright 2014-2022 by Contributors * \file gbtree.cc * \brief gradient boosted tree implementation. * \author Tianqi Chen @@ -202,10 +202,16 @@ class GBTree : public GradientBooster { void ConfigureUpdaters(); void ConfigureWithKnownData(Args const& cfg, DMatrix* fmat); + /** + * \brief Optionally update the leaf value. + */ + void UpdateTreeLeaf(DMatrix const* p_fmat, HostDeviceVector const& predictions, + ObjFunction const* obj, size_t gidx, + std::vector>* p_trees); + /*! \brief Carry out one iteration of boosting */ - void DoBoost(DMatrix* p_fmat, - HostDeviceVector* in_gpair, - PredictionCacheEntry* predt) override; + void DoBoost(DMatrix* p_fmat, HostDeviceVector* in_gpair, + PredictionCacheEntry* predt, ObjFunction const* obj) override; bool UseGPU() const override { return @@ -435,6 +441,9 @@ class GBTree : public GradientBooster { Args cfg_; // the updaters that can be applied to each of tree std::vector> updaters_; + // The node position for each row, 1 HDV for each tree in the forest. Note that the + // position is negated if the row is sampled out. + std::vector> node_position_; // Predictors std::unique_ptr cpu_predictor_; #if defined(XGBOOST_USE_CUDA) diff --git a/src/learner.cc b/src/learner.cc index 7d8419259e1e..20b342b7b366 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -1169,7 +1169,7 @@ class LearnerImpl : public LearnerIO { monitor_.Stop("GetGradient"); TrainingObserver::Instance().Observe(gpair_, "Gradients"); - gbm_->DoBoost(train.get(), &gpair_, &predt); + gbm_->DoBoost(train.get(), &gpair_, &predt, obj_.get()); monitor_.Stop("UpdateOneIter"); } @@ -1186,7 +1186,7 @@ class LearnerImpl : public LearnerIO { auto local_cache = this->GetPredictionCache(); local_cache->Cache(train, generic_parameters_.gpu_id); - gbm_->DoBoost(train.get(), in_gpair, &local_cache->Entry(train.get())); + gbm_->DoBoost(train.get(), in_gpair, &local_cache->Entry(train.get()), obj_.get()); monitor_.Stop("BoostOneIter"); } diff --git a/src/metric/auc.cu b/src/metric/auc.cu index be89c015c93d..5faa116c8561 100644 --- a/src/metric/auc.cu +++ b/src/metric/auc.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2021 by XGBoost Contributors + * Copyright 2021-2022 by XGBoost Contributors */ #include #include @@ -201,14 +201,6 @@ void Transpose(common::Span in, common::Span out, size_t m, }); } -/** - * Last index of a group in a CSR style of index pointer. 
- */ -template -XGBOOST_DEVICE size_t LastOf(size_t group, common::Span indptr) { - return indptr[group + 1] - 1; -} - double ScaleClasses(common::Span results, common::Span local_area, common::Span fp, common::Span tp, common::Span auc, @@ -300,9 +292,9 @@ void SegmentedReduceAUC(common::Span d_unique_idx, double fp, tp, fp_prev, tp_prev; if (i == d_unique_class_ptr[class_id]) { // first item is ignored, we use this thread to calculate the last item - thrust::tie(fp, tp) = d_fptp[LastOf(class_id, d_class_ptr)]; + thrust::tie(fp, tp) = d_fptp[common::LastOf(class_id, d_class_ptr)]; thrust::tie(fp_prev, tp_prev) = - d_neg_pos[d_unique_idx[LastOf(class_id, d_unique_class_ptr)]]; + d_neg_pos[d_unique_idx[common::LastOf(class_id, d_unique_class_ptr)]]; } else { thrust::tie(fp, tp) = d_fptp[d_unique_idx[i] - 1]; thrust::tie(fp_prev, tp_prev) = d_neg_pos[d_unique_idx[i - 1]]; @@ -413,10 +405,10 @@ double GPUMultiClassAUCOVR(common::Span predts, } uint32_t class_id = d_unique_idx[i] / n_samples; d_neg_pos[d_unique_idx[i]] = d_fptp[d_unique_idx[i] - 1]; - if (i == LastOf(class_id, d_unique_class_ptr)) { + if (i == common::LastOf(class_id, d_unique_class_ptr)) { // last one needs to be included. - size_t last = d_unique_idx[LastOf(class_id, d_unique_class_ptr)]; - d_neg_pos[LastOf(class_id, d_class_ptr)] = d_fptp[last - 1]; + size_t last = d_unique_idx[common::LastOf(class_id, d_unique_class_ptr)]; + d_neg_pos[common::LastOf(class_id, d_class_ptr)] = d_fptp[last - 1]; return; } }); @@ -592,7 +584,7 @@ GPURankingAUC(common::Span predts, MetaInfo const &info, auto data_group_begin = d_group_ptr[group_id]; size_t n_samples = d_group_ptr[group_id + 1] - data_group_begin; // last item of current group - if (item.idx == LastOf(group_id, d_threads_group_ptr)) { + if (item.idx == common::LastOf(group_id, d_threads_group_ptr)) { if (item.w > 0) { s_d_auc[group_id] = item.predt / item.w; } else { @@ -797,10 +789,10 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, } auto group_idx = dh::SegmentId(d_group_ptr, d_unique_idx[i]); d_neg_pos[d_unique_idx[i]] = d_fptp[d_unique_idx[i] - 1]; - if (i == LastOf(group_idx, d_unique_class_ptr)) { + if (i == common::LastOf(group_idx, d_unique_class_ptr)) { // last one needs to be included. - size_t last = d_unique_idx[LastOf(group_idx, d_unique_class_ptr)]; - d_neg_pos[LastOf(group_idx, d_group_ptr)] = d_fptp[last - 1]; + size_t last = d_unique_idx[common::LastOf(group_idx, d_unique_class_ptr)]; + d_neg_pos[common::LastOf(group_idx, d_group_ptr)] = d_fptp[last - 1]; return; } }); @@ -821,7 +813,7 @@ GPURankingPRAUCImpl(common::Span predts, MetaInfo const &info, auto it = dh::MakeTransformIterator>( thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t g) { double fp, tp; - thrust::tie(fp, tp) = d_fptp[LastOf(g, d_group_ptr)]; + thrust::tie(fp, tp) = d_fptp[common::LastOf(g, d_group_ptr)]; double area = fp * tp; auto n_documents = d_group_ptr[g + 1] - d_group_ptr[g]; if (area > 0 && n_documents >= 2) { diff --git a/src/objective/adaptive.cc b/src/objective/adaptive.cc new file mode 100644 index 000000000000..f2675d918bdf --- /dev/null +++ b/src/objective/adaptive.cc @@ -0,0 +1,126 @@ +/*! 
+ * Copyright 2022 by XGBoost Contributors + */ +#include "adaptive.h" + +#include +#include + +#include "../common/common.h" +#include "../common/stats.h" +#include "../common/threading_utils.h" +#include "xgboost/tree_model.h" + +namespace xgboost { +namespace obj { +namespace detail { +void EncodeTreeLeafHost(RegTree const& tree, std::vector const& position, + std::vector* p_nptr, std::vector* p_nidx, + std::vector* p_ridx) { + auto& nptr = *p_nptr; + auto& nidx = *p_nidx; + auto& ridx = *p_ridx; + ridx = common::ArgSort(position); + std::vector sorted_pos(position); + // permutation + for (size_t i = 0; i < position.size(); ++i) { + sorted_pos[i] = position[ridx[i]]; + } + // find the first non-sampled row + auto begin_pos = + std::distance(sorted_pos.cbegin(), std::find_if(sorted_pos.cbegin(), sorted_pos.cend(), + [](bst_node_t nidx) { return nidx >= 0; })); + CHECK_LE(begin_pos, sorted_pos.size()); + + std::vector leaf; + tree.WalkTree([&](bst_node_t nidx) { + if (tree[nidx].IsLeaf()) { + leaf.push_back(nidx); + } + return true; + }); + + if (begin_pos == sorted_pos.size()) { + nidx = leaf; + return; + } + + auto beg_it = sorted_pos.begin() + begin_pos; + common::RunLengthEncode(beg_it, sorted_pos.end(), &nptr); + CHECK_GT(nptr.size(), 0); + // skip the sampled rows in indptr + std::transform(nptr.begin(), nptr.end(), nptr.begin(), + [begin_pos](size_t ptr) { return ptr + begin_pos; }); + + size_t n_leaf = nptr.size() - 1; + auto n_unique = std::unique(beg_it, sorted_pos.end()) - beg_it; + CHECK_EQ(n_unique, n_leaf); + nidx.resize(n_leaf); + std::copy(beg_it, beg_it + n_unique, nidx.begin()); + + if (n_leaf != leaf.size()) { + FillMissingLeaf(leaf, &nidx, &nptr); + } +} + +void UpdateTreeLeafHost(Context const* ctx, std::vector const& position, + MetaInfo const& info, HostDeviceVector const& predt, float alpha, + RegTree* p_tree) { + auto& tree = *p_tree; + + std::vector nidx; + std::vector nptr; + std::vector ridx; + EncodeTreeLeafHost(*p_tree, position, &nptr, &nidx, &ridx); + size_t n_leaf = nidx.size(); + if (nptr.empty()) { + std::vector quantiles; + UpdateLeafValues(&quantiles, nidx, p_tree); + return; + } + + CHECK(!position.empty()); + std::vector quantiles(n_leaf, 0); + std::vector n_valids(n_leaf, 0); + + auto const& h_node_idx = nidx; + auto const& h_node_ptr = nptr; + CHECK_LE(h_node_ptr.back(), info.num_row_); + // loop over each leaf + common::ParallelFor(quantiles.size(), ctx->Threads(), [&](size_t k) { + auto nidx = h_node_idx[k]; + CHECK(tree[nidx].IsLeaf()); + CHECK_LT(k + 1, h_node_ptr.size()); + size_t n = h_node_ptr[k + 1] - h_node_ptr[k]; + auto h_row_set = common::Span{ridx}.subspan(h_node_ptr[k], n); + // multi-target not yet supported. 
+ auto h_labels = info.labels.HostView().Slice(linalg::All(), 0); + auto const& h_predt = predt.ConstHostVector(); + auto h_weights = linalg::MakeVec(&info.weights_); + + auto iter = common::MakeIndexTransformIter([&](size_t i) -> float { + auto row_idx = h_row_set[i]; + return h_labels(row_idx) - h_predt[row_idx]; + }); + auto w_it = common::MakeIndexTransformIter([&](size_t i) -> float { + auto row_idx = h_row_set[i]; + return h_weights(row_idx); + }); + + float q{0}; + if (info.weights_.Empty()) { + q = common::Quantile(alpha, iter, iter + h_row_set.size()); + } else { + q = common::WeightedQuantile(alpha, iter, iter + h_row_set.size(), w_it); + } + if (std::isnan(q)) { + CHECK(h_row_set.empty()); + } + quantiles.at(k) = q; + }); + + UpdateLeafValues(&quantiles, nidx, p_tree); +} +} // namespace detail +} // namespace obj +} // namespace xgboost diff --git a/src/objective/adaptive.cu b/src/objective/adaptive.cu new file mode 100644 index 000000000000..42d239acd977 --- /dev/null +++ b/src/objective/adaptive.cu @@ -0,0 +1,182 @@ +/*! + * Copyright 2022 by XGBoost Contributors + */ +#include + +#include + +#include "../common/device_helpers.cuh" +#include "../common/stats.cuh" +#include "adaptive.h" + +namespace xgboost { +namespace obj { +namespace detail { +void EncodeTreeLeafDevice(Context const* ctx, common::Span position, + dh::device_vector* p_ridx, HostDeviceVector* p_nptr, + HostDeviceVector* p_nidx, RegTree const& tree) { + // copy position to buffer + dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + size_t n_samples = position.size(); + dh::XGBDeviceAllocator alloc; + dh::device_vector sorted_position(position.size()); + dh::safe_cuda(cudaMemcpyAsync(sorted_position.data().get(), position.data(), + position.size_bytes(), cudaMemcpyDeviceToDevice)); + + p_ridx->resize(position.size()); + dh::Iota(dh::ToSpan(*p_ridx)); + // sort row index according to node index + thrust::stable_sort_by_key(thrust::cuda::par(alloc), sorted_position.begin(), + sorted_position.begin() + n_samples, p_ridx->begin()); + dh::XGBCachingDeviceAllocator caching; + auto beg_pos = + thrust::find_if(thrust::cuda::par(caching), sorted_position.cbegin(), sorted_position.cend(), + [] XGBOOST_DEVICE(bst_node_t nidx) { return nidx >= 0; }) - + sorted_position.cbegin(); + if (beg_pos == sorted_position.size()) { + auto& leaf = p_nidx->HostVector(); + tree.WalkTree([&](bst_node_t nidx) { + if (tree[nidx].IsLeaf()) { + leaf.push_back(nidx); + } + return true; + }); + return; + } + + size_t n_leaf = tree.GetNumLeaves(); + size_t max_n_unique = n_leaf; + + dh::caching_device_vector counts_out(max_n_unique + 1, 0); + auto d_counts_out = dh::ToSpan(counts_out).subspan(0, max_n_unique); + auto d_num_runs_out = dh::ToSpan(counts_out).subspan(max_n_unique, 1); + dh::caching_device_vector unique_out(max_n_unique, 0); + auto d_unique_out = dh::ToSpan(unique_out); + + size_t nbytes; + auto begin_it = sorted_position.begin() + beg_pos; + cub::DeviceRunLengthEncode::Encode(nullptr, nbytes, begin_it, unique_out.data().get(), + counts_out.data().get(), d_num_runs_out.data(), + n_samples - beg_pos); + dh::TemporaryArray temp(nbytes); + cub::DeviceRunLengthEncode::Encode(temp.data().get(), nbytes, begin_it, unique_out.data().get(), + counts_out.data().get(), d_num_runs_out.data(), + n_samples - beg_pos); + + dh::PinnedMemory pinned_pool; + auto pinned = pinned_pool.GetSpan(sizeof(size_t) + sizeof(bst_node_t)); + dh::CUDAStream copy_stream; + size_t* h_num_runs = reinterpret_cast(pinned.subspan(0, sizeof(size_t)).data()); + // flag for 
whether there is an ignored position
+  bst_node_t* h_first_unique =
+      reinterpret_cast<bst_node_t*>(pinned.subspan(sizeof(size_t), sizeof(bst_node_t)).data());
+  dh::safe_cuda(cudaMemcpyAsync(h_num_runs, d_num_runs_out.data(), sizeof(size_t),
+                                cudaMemcpyDeviceToHost, copy_stream.View()));
+  dh::safe_cuda(cudaMemcpyAsync(h_first_unique, d_unique_out.data(), sizeof(bst_node_t),
+                                cudaMemcpyDeviceToHost, copy_stream.View()));
+
+  /**
+   * copy node index (leaf index)
+   */
+  auto& nidx = *p_nidx;
+  auto& nptr = *p_nptr;
+  nidx.SetDevice(ctx->gpu_id);
+  nidx.Resize(n_leaf);
+  auto d_node_idx = nidx.DeviceSpan();
+
+  nptr.SetDevice(ctx->gpu_id);
+  nptr.Resize(n_leaf + 1, 0);
+  auto d_node_ptr = nptr.DeviceSpan();
+
+  dh::LaunchN(n_leaf, [=] XGBOOST_DEVICE(size_t i) {
+    if (i >= d_num_runs_out[0]) {
+      // d_num_runs_out <= max_n_unique
+      // this omits all the leaves that are empty. A leaf can be empty when there's
+      // missing data, which can be caused by sparse input and distributed training.
+      return;
+    }
+    d_node_idx[i] = d_unique_out[i];
+    d_node_ptr[i + 1] = d_counts_out[i];
+    if (i == 0) {
+      d_node_ptr[0] = beg_pos;
+    }
+  });
+  thrust::inclusive_scan(thrust::cuda::par(caching), dh::tbegin(d_node_ptr), dh::tend(d_node_ptr),
+                         dh::tbegin(d_node_ptr));
+  copy_stream.View().Sync();
+  CHECK_GT(*h_num_runs, 0);
+  CHECK_LE(*h_num_runs, n_leaf);
+
+  if (*h_num_runs < n_leaf) {
+    // shrink to omit the sampled nodes.
+    nptr.Resize(*h_num_runs + 1);
+    nidx.Resize(*h_num_runs);
+
+    std::vector<bst_node_t> leaves;
+    tree.WalkTree([&](bst_node_t nidx) {
+      if (tree[nidx].IsLeaf()) {
+        leaves.push_back(nidx);
+      }
+      return true;
+    });
+    CHECK_EQ(leaves.size(), n_leaf);
+    // Fill all the leaves that don't have any sample. This is hacky and inefficient. An
+    // alternative is to leave the objective to handle missing leaves, which is messier
+    // as we need to take other distributed workers into account.
+ auto& h_nidx = nidx.HostVector(); + auto& h_nptr = nptr.HostVector(); + FillMissingLeaf(leaves, &h_nidx, &h_nptr); + nidx.DevicePointer(); + nptr.DevicePointer(); + } + CHECK_EQ(nidx.Size(), n_leaf); + CHECK_EQ(nptr.Size(), n_leaf + 1); +} + +void UpdateTreeLeafDevice(Context const* ctx, common::Span position, + MetaInfo const& info, HostDeviceVector const& predt, float alpha, + RegTree* p_tree) { + dh::safe_cuda(cudaSetDevice(ctx->gpu_id)); + dh::device_vector ridx; + HostDeviceVector nptr; + HostDeviceVector nidx; + + EncodeTreeLeafDevice(ctx, position, &ridx, &nptr, &nidx, *p_tree); + + if (nptr.Empty()) { + std::vector quantiles; + UpdateLeafValues(&quantiles, nidx.ConstHostVector(), p_tree); + } + + HostDeviceVector quantiles; + predt.SetDevice(ctx->gpu_id); + auto d_predt = predt.ConstDeviceSpan(); + auto d_labels = info.labels.View(ctx->gpu_id); + + auto d_row_index = dh::ToSpan(ridx); + auto seg_beg = nptr.DevicePointer(); + auto seg_end = seg_beg + nptr.Size(); + auto val_beg = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] XGBOOST_DEVICE(size_t i) { + auto predt = d_predt[d_row_index[i]]; + auto y = d_labels(d_row_index[i]); + return y - predt; + }); + auto val_end = val_beg + d_labels.Size(); + CHECK_EQ(nidx.Size() + 1, nptr.Size()); + if (info.weights_.Empty()) { + common::SegmentedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, &quantiles); + } else { + info.weights_.SetDevice(ctx->gpu_id); + auto d_weights = info.weights_.ConstDeviceSpan(); + CHECK_EQ(d_weights.size(), d_row_index.size()); + auto w_it = thrust::make_permutation_iterator(dh::tcbegin(d_weights), dh::tcbegin(d_row_index)); + common::SegmentedWeightedQuantile(ctx, alpha, seg_beg, seg_end, val_beg, val_end, w_it, + w_it + d_weights.size(), &quantiles); + } + + UpdateLeafValues(&quantiles.HostVector(), nidx.ConstHostVector(), p_tree); +} +} // namespace detail +} // namespace obj +} // namespace xgboost diff --git a/src/objective/adaptive.h b/src/objective/adaptive.h new file mode 100644 index 000000000000..85c041347cb9 --- /dev/null +++ b/src/objective/adaptive.h @@ -0,0 +1,83 @@ +/*! 
+ * Copyright 2022 by XGBoost Contributors + */ +#pragma once + +#include +#include +#include + +#include "rabit/rabit.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/tree_model.h" + +namespace xgboost { +namespace obj { +namespace detail { +inline void FillMissingLeaf(std::vector const& maybe_missing, + std::vector* p_nidx, std::vector* p_nptr) { + auto& h_node_idx = *p_nidx; + auto& h_node_ptr = *p_nptr; + + for (auto leaf : maybe_missing) { + if (std::binary_search(h_node_idx.cbegin(), h_node_idx.cend(), leaf)) { + continue; + } + auto it = std::upper_bound(h_node_idx.cbegin(), h_node_idx.cend(), leaf); + auto pos = it - h_node_idx.cbegin(); + h_node_idx.insert(h_node_idx.cbegin() + pos, leaf); + h_node_ptr.insert(h_node_ptr.cbegin() + pos, h_node_ptr[pos]); + } +} + +inline void UpdateLeafValues(std::vector* p_quantiles, std::vector const nidx, + RegTree* p_tree) { + auto& tree = *p_tree; + auto& quantiles = *p_quantiles; + auto const& h_node_idx = nidx; + + size_t n_leaf{h_node_idx.size()}; + rabit::Allreduce(&n_leaf, 1); + CHECK(quantiles.empty() || quantiles.size() == n_leaf); + if (quantiles.empty()) { + quantiles.resize(n_leaf, std::numeric_limits::quiet_NaN()); + } + + // number of workers that have valid quantiles + std::vector n_valids(quantiles.size()); + std::transform(quantiles.cbegin(), quantiles.cend(), n_valids.begin(), + [](float q) { return static_cast(!std::isnan(q)); }); + rabit::Allreduce(n_valids.data(), n_valids.size()); + // convert to 0 for all reduce + std::replace_if( + quantiles.begin(), quantiles.end(), [](float q) { return std::isnan(q); }, 0.f); + // use the mean value + rabit::Allreduce(quantiles.data(), quantiles.size()); + for (size_t i = 0; i < n_leaf; ++i) { + if (n_valids[i] > 0) { + quantiles[i] /= static_cast(n_valids[i]); + } else { + // Use original leaf value if no worker can provide the quantile. 
+ quantiles[i] = tree[h_node_idx[i]].LeafValue(); + } + } + + for (size_t i = 0; i < nidx.size(); ++i) { + auto nidx = h_node_idx[i]; + auto q = quantiles[i]; + CHECK(tree[nidx].IsLeaf()); + tree[nidx].SetLeaf(q); + } +} + +void UpdateTreeLeafDevice(Context const* ctx, common::Span position, + MetaInfo const& info, HostDeviceVector const& predt, float alpha, + RegTree* p_tree); + +void UpdateTreeLeafHost(Context const* ctx, std::vector const& position, + MetaInfo const& info, HostDeviceVector const& predt, float alpha, + RegTree* p_tree); +} // namespace detail +} // namespace obj +} // namespace xgboost diff --git a/src/objective/aft_obj.cu b/src/objective/aft_obj.cu index 0e2d9290f95c..5f2306dee082 100644 --- a/src/objective/aft_obj.cu +++ b/src/objective/aft_obj.cu @@ -34,11 +34,11 @@ DMLC_REGISTRY_FILE_TAG(aft_obj_gpu); class AFTObj : public ObjFunction { public: - void Configure(const std::vector >& args) override { + void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return {ObjInfo::kSurvival, false}; } + ObjInfo Task() const override { return ObjInfo::kSurvival; } template void GetGradientImpl(const HostDeviceVector &preds, diff --git a/src/objective/hinge.cu b/src/objective/hinge.cu index e1f0df74d4e1..e062b2b48e3c 100644 --- a/src/objective/hinge.cu +++ b/src/objective/hinge.cu @@ -24,10 +24,8 @@ class HingeObj : public ObjFunction { public: HingeObj() = default; - void Configure( - const std::vector > &args) override {} - - ObjInfo Task() const override { return {ObjInfo::kRegression, false}; } + void Configure(Args const&) override {} + ObjInfo Task() const override { return ObjInfo::kRegression; } void GetGradient(const HostDeviceVector &preds, const MetaInfo &info, diff --git a/src/objective/multiclass_obj.cu b/src/objective/multiclass_obj.cu index 4b912a81710d..312992ec59f2 100644 --- a/src/objective/multiclass_obj.cu +++ b/src/objective/multiclass_obj.cu @@ -46,7 +46,7 @@ class SoftmaxMultiClassObj : public ObjFunction { param_.UpdateAllowUnknown(args); } - ObjInfo Task() const override { return {ObjInfo::kClassification, false}; } + ObjInfo Task() const override { return ObjInfo::kClassification; } void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, diff --git a/src/objective/rank_obj.cu b/src/objective/rank_obj.cu index 0bbf6f6df26b..f1c8702102df 100644 --- a/src/objective/rank_obj.cu +++ b/src/objective/rank_obj.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2015-2019 XGBoost contributors + * Copyright 2015-2022 XGBoost contributors */ #include #include @@ -750,11 +750,8 @@ class SortedLabelList : dh::SegmentSorter { template class LambdaRankObj : public ObjFunction { public: - void Configure(const std::vector >& args) override { - param_.UpdateAllowUnknown(args); - } - - ObjInfo Task() const override { return {ObjInfo::kRanking, false}; } + void Configure(Args const &args) override { param_.UpdateAllowUnknown(args); } + ObjInfo Task() const override { return ObjInfo::kRanking; } void GetGradient(const HostDeviceVector& preds, const MetaInfo& info, diff --git a/src/objective/regression_loss.h b/src/objective/regression_loss.h index f92dfe2d47d7..f394432a8f28 100644 --- a/src/objective/regression_loss.h +++ b/src/objective/regression_loss.h @@ -1,5 +1,5 @@ /*! 
- * Copyright 2017-2019 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #ifndef XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_ #define XGBOOST_OBJECTIVE_REGRESSION_LOSS_H_ @@ -38,7 +38,7 @@ struct LinearSquareLoss { static const char* DefaultEvalMetric() { return "rmse"; } static const char* Name() { return "reg:squarederror"; } - static ObjInfo Info() { return {ObjInfo::kRegression, true}; } + static ObjInfo Info() { return {ObjInfo::kRegression, true, false}; } }; struct SquaredLogError { @@ -65,7 +65,7 @@ struct SquaredLogError { static const char* Name() { return "reg:squaredlogerror"; } - static ObjInfo Info() { return {ObjInfo::kRegression, false}; } + static ObjInfo Info() { return ObjInfo::kRegression; } }; // logistic loss for probability regression task @@ -102,14 +102,14 @@ struct LogisticRegression { static const char* Name() { return "reg:logistic"; } - static ObjInfo Info() { return {ObjInfo::kRegression, false}; } + static ObjInfo Info() { return ObjInfo::kRegression; } }; // logistic loss for binary classification task struct LogisticClassification : public LogisticRegression { static const char* DefaultEvalMetric() { return "logloss"; } static const char* Name() { return "binary:logistic"; } - static ObjInfo Info() { return {ObjInfo::kBinary, false}; } + static ObjInfo Info() { return ObjInfo::kBinary; } }; // logistic loss, but predict un-transformed margin @@ -146,7 +146,7 @@ struct LogisticRaw : public LogisticRegression { static const char* Name() { return "binary:logitraw"; } - static ObjInfo Info() { return {ObjInfo::kRegression, false}; } + static ObjInfo Info() { return ObjInfo::kRegression; } }; } // namespace obj diff --git a/src/objective/regression_obj.cu b/src/objective/regression_obj.cu index fa294a5a5773..3dc4a7b82316 100644 --- a/src/objective/regression_obj.cu +++ b/src/objective/regression_obj.cu @@ -4,10 +4,10 @@ * \brief Definition of single-value regression and classification objectives. * \author Tianqi Chen, Kailong Chen */ - #include #include #include +#include #include #include @@ -19,12 +19,18 @@ #include "../common/threading_utils.h" #include "../common/transform.h" #include "./regression_loss.h" +#include "adaptive.h" +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/generic_parameters.h" #include "xgboost/host_device_vector.h" #include "xgboost/json.h" +#include "xgboost/linalg.h" #include "xgboost/parameter.h" #include "xgboost/span.h" #if defined(XGBOOST_USE_CUDA) +#include "../common/device_helpers.cuh" #include "../common/linalg_op.cuh" #endif // defined(XGBOOST_USE_CUDA) @@ -67,9 +73,7 @@ class RegLossObj : public ObjFunction { param_.UpdateAllowUnknown(args); } - struct ObjInfo Task() const override { - return Loss::Info(); - } + ObjInfo Task() const override { return Loss::Info(); } uint32_t Targets(MetaInfo const& info) const override { // Multi-target regression. 
@@ -209,7 +213,7 @@ class PseudoHuberRegression : public ObjFunction { public: void Configure(Args const& args) override { param_.UpdateAllowUnknown(args); } - struct ObjInfo Task() const override { return {ObjInfo::kRegression, false}; } + ObjInfo Task() const override { return ObjInfo::kRegression; } uint32_t Targets(MetaInfo const& info) const override { return std::max(static_cast(1), info.labels.Shape(1)); } @@ -286,9 +290,7 @@ class PoissonRegression : public ObjFunction { param_.UpdateAllowUnknown(args); } - struct ObjInfo Task() const override { - return {ObjInfo::kRegression, false}; - } + ObjInfo Task() const override { return ObjInfo::kRegression; } void GetGradient(const HostDeviceVector& preds, const MetaInfo &info, int, @@ -378,12 +380,8 @@ XGBOOST_REGISTER_OBJECTIVE(PoissonRegression, "count:poisson") // cox regression for survival data (negative values mean they are censored) class CoxRegression : public ObjFunction { public: - void Configure( - const std::vector >&) override {} - - struct ObjInfo Task() const override { - return {ObjInfo::kRegression, false}; - } + void Configure(Args const&) override {} + ObjInfo Task() const override { return ObjInfo::kRegression; } void GetGradient(const HostDeviceVector& preds, const MetaInfo &info, int, @@ -479,12 +477,8 @@ XGBOOST_REGISTER_OBJECTIVE(CoxRegression, "survival:cox") // gamma regression class GammaRegression : public ObjFunction { public: - void Configure( - const std::vector >&) override {} - - struct ObjInfo Task() const override { - return {ObjInfo::kRegression, false}; - } + void Configure(Args const&) override {} + ObjInfo Task() const override { return ObjInfo::kRegression; } void GetGradient(const HostDeviceVector &preds, const MetaInfo &info, int, @@ -582,9 +576,7 @@ class TweedieRegression : public ObjFunction { metric_ = os.str(); } - struct ObjInfo Task() const override { - return {ObjInfo::kRegression, false}; - } + ObjInfo Task() const override { return ObjInfo::kRegression; } void GetGradient(const HostDeviceVector& preds, const MetaInfo &info, int, @@ -675,5 +667,65 @@ XGBOOST_REGISTER_OBJECTIVE(TweedieRegression, "reg:tweedie") .describe("Tweedie regression for insurance data.") .set_body([]() { return new TweedieRegression(); }); +class MeanAbsoluteError : public ObjFunction { + public: + void Configure(Args const&) override {} + ObjInfo Task() const override { return {ObjInfo::kRegression, true, true}; } + + void GetGradient(HostDeviceVector const& preds, const MetaInfo& info, int iter, + HostDeviceVector* out_gpair) override { + CheckRegInputs(info, preds); + auto labels = info.labels.View(ctx_->gpu_id); + + out_gpair->SetDevice(ctx_->gpu_id); + out_gpair->Resize(info.labels.Size()); + auto gpair = linalg::MakeVec(out_gpair); + + preds.SetDevice(ctx_->gpu_id); + auto predt = linalg::MakeVec(&preds); + info.weights_.SetDevice(ctx_->gpu_id); + common::OptionalWeights weight{ctx_->IsCPU() ? 
info.weights_.ConstHostSpan()
+                                               : info.weights_.ConstDeviceSpan()};
+
+    linalg::ElementWiseKernel(ctx_, labels, [=] XGBOOST_DEVICE(size_t i, float const y) mutable {
+      auto sign = [](auto x) {
+        return (x > static_cast<decltype(x)>(0)) - (x < static_cast<decltype(x)>(0));
+      };
+      auto sample_id = std::get<0>(linalg::UnravelIndex(i, labels.Shape()));
+      auto grad = sign(predt(i) - y) * weight[sample_id];
+      auto hess = weight[sample_id];
+      gpair(i) = GradientPair{grad, hess};
+    });
+  }
+
+  void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
+                      HostDeviceVector<float> const& prediction, RegTree* p_tree) const override {
+    if (ctx_->IsCPU()) {
+      auto const& h_position = position.ConstHostVector();
+      detail::UpdateTreeLeafHost(ctx_, h_position, info, prediction, 0.5, p_tree);
+    } else {
+#if defined(XGBOOST_USE_CUDA)
+      position.SetDevice(ctx_->gpu_id);
+      auto d_position = position.ConstDeviceSpan();
+      detail::UpdateTreeLeafDevice(ctx_, d_position, info, prediction, 0.5, p_tree);
+#else
+      common::AssertGPUSupport();
+#endif  // defined(XGBOOST_USE_CUDA)
+    }
+  }
+
+  const char* DefaultEvalMetric() const override { return "mae"; }
+
+  void SaveConfig(Json* p_out) const override {
+    auto& out = *p_out;
+    out["name"] = String("reg:absoluteerror");
+  }
+
+  void LoadConfig(Json const& in) override {}
+};
+
+XGBOOST_REGISTER_OBJECTIVE(MeanAbsoluteError, "reg:absoluteerror")
+    .describe("Mean absolute error.")
+    .set_body([]() { return new MeanAbsoluteError(); });
 }  // namespace obj
 }  // namespace xgboost
diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh
index 1b5a5222229e..9470b6447512 100644
--- a/src/tree/gpu_hist/row_partitioner.cuh
+++ b/src/tree/gpu_hist/row_partitioner.cuh
@@ -1,13 +1,17 @@
 /*!
- * Copyright 2017-2019 XGBoost contributors
+ * Copyright 2017-2022 XGBoost contributors
 */
 #pragma once
+#include
+#include
 #include "xgboost/base.h"
 #include "../../common/device_helpers.cuh"
+#include "xgboost/generic_parameters.h"
+#include "xgboost/task.h"
+#include "xgboost/tree_model.h"
 
 namespace xgboost {
 namespace tree {
-
 /*! \brief Count how many rows are assigned to left node. */
 __forceinline__ __device__ void AtomicIncrement(int64_t* d_count, bool increment) {
 #if __CUDACC_VER_MAJOR__ > 8
@@ -149,23 +153,48 @@ class RowPartitioner {
   }
 
   /**
-   * \brief Finalise the position of all training instances after tree
-   * construction is complete. Does not update any other meta information in
-   * this data structure, so should only be used at the end of training.
+   * \brief Finalise the position of all training instances after tree construction is
+   * complete. Does not update any other meta information in this data structure, so
+   * should only be used at the end of training.
+   *
+   * When the task requires updating the leaf values, this function copies the node
+   * index into p_out_position. The index is negated if the row is sampled out in the
+   * current iteration.
    *
-   * \param op Device lambda. Should provide the row index and current
-   * position as an argument and return the new position for this training
-   * instance.
+   * \param p_out_position Node index for each row.
+   * \param op Device lambda. Should provide the row index and current position as an
+   *           argument and return the new position for this training instance.
+   * \param sampledp A device lambda to inform the partitioner whether a row is sampled.
+   */
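To make the new contract concrete, here is a rough sketch of a call site for this
overload; both lambdas are placeholders rather than the real implementations in this
patch (which walk the tree and consult the sampling strategy):

    HostDeviceVector<bst_node_t> out_position;
    row_partitioner->FinalisePosition(
        ctx, task, &out_position,
        [=] __device__(RowPartitioner::RowIndexT ridx, bst_node_t pos) {
          return pos;  // placeholder: return the final leaf for this row
        },
        [=] __device__(RowPartitioner::RowIndexT ridx) {
          return false;  // placeholder: true when the row was sampled out
        });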
-  template <typename FinalisePositionOpT>
-  void FinalisePosition(FinalisePositionOpT op) {
+  template <typename FinalisePositionOpT, typename Sampledp>
+  void FinalisePosition(Context const* ctx, ObjInfo task,
+                        HostDeviceVector<bst_node_t>* p_out_position, FinalisePositionOpT op,
+                        Sampledp sampledp) {
     auto d_position = position_.Current();
     const auto d_ridx = ridx_.Current();
+    if (!task.UpdateTreeLeaf()) {
+      dh::LaunchN(position_.Size(), [=] __device__(size_t idx) {
+        auto position = d_position[idx];
+        RowIndexT ridx = d_ridx[idx];
+        bst_node_t new_position = op(ridx, position);
+        if (new_position == kIgnoredTreePosition) {
+          return;
+        }
+        d_position[idx] = new_position;
+      });
+      return;
+    }
+
+    p_out_position->SetDevice(ctx->gpu_id);
+    p_out_position->Resize(position_.Size());
+    auto sorted_position = p_out_position->DevicePointer();
     dh::LaunchN(position_.Size(), [=] __device__(size_t idx) {
       auto position = d_position[idx];
       RowIndexT ridx = d_ridx[idx];
       bst_node_t new_position = op(ridx, position);
-      if (new_position == kIgnoredTreePosition) return;
+      sorted_position[ridx] = sampledp(ridx) ? ~new_position : new_position;
+      if (new_position == kIgnoredTreePosition) {
+        return;
+      }
       d_position[idx] = new_position;
     });
   }
diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h
index 053b485012bd..4e445a0680e5 100644
--- a/src/tree/hist/evaluate_splits.h
+++ b/src/tree/hist/evaluate_splits.h
@@ -390,7 +390,6 @@ void UpdatePredictionCacheImpl(GenericParameter const *ctx, RegTree const *p_las
   CHECK(p_last_tree);
 
   auto const &tree = *p_last_tree;
-  auto const &snode = hist_evaluator.Stats();
   auto evaluator = hist_evaluator.Evaluator();
   CHECK_EQ(out_preds.DeviceIdx(), GenericParameter::kCpuId);
   size_t n_nodes = p_last_tree->GetNodes().size();
@@ -401,9 +400,7 @@ void UpdatePredictionCacheImpl(GenericParameter const *ctx, RegTree const *p_las
   common::ParallelFor2d(space, ctx->Threads(), [&](size_t nidx, common::Range1d r) {
     if (!tree[nidx].IsDeleted() && tree[nidx].IsLeaf()) {
       auto const &rowset = part[nidx];
-      auto const &stats = snode[nidx];
-      auto leaf_value =
-          evaluator.CalcWeight(nidx, param, GradStats{stats.stats}) * param.learning_rate;
+      auto leaf_value = tree[nidx].LeafValue();
       for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) {
         out_preds(*it) += leaf_value;
       }
diff --git a/src/tree/updater_approx.cc b/src/tree/updater_approx.cc
index 3bad6f7da4cc..4222cddb1ee9 100644
--- a/src/tree/updater_approx.cc
+++ b/src/tree/updater_approx.cc
@@ -19,6 +19,7 @@
 #include "param.h"
 #include "xgboost/base.h"
 #include "xgboost/json.h"
+#include "xgboost/tree_model.h"
 #include "xgboost/tree_updater.h"
 
 namespace xgboost {
@@ -154,6 +155,18 @@ class GloablApproxBuilder {
     monitor_->Stop(__func__);
   }
 
+  void LeafPartition(RegTree const &tree, common::Span<float const> hess,
+                     std::vector<bst_node_t> *p_out_position) {
+    monitor_->Start(__func__);
+    if (!evaluator_.Task().UpdateTreeLeaf()) {
+      return;
+    }
+    for (auto const &part : partitioner_) {
+      part.LeafPartition(ctx_, tree, hess, p_out_position);
+    }
+    monitor_->Stop(__func__);
+  }
+
 public:
  explicit GloablApproxBuilder(TrainParam param, MetaInfo const &info, GenericParameter const *ctx,
                               std::shared_ptr<common::ColumnSampler> column_sampler, ObjInfo task,
@@ -164,8 +177,8 @@
       ctx_{ctx},
       monitor_{monitor} {}
 
-  void UpdateTree(RegTree *p_tree, std::vector<GradientPair> const &gpair, common::Span<float> hess,
-                  DMatrix *p_fmat) {
+  void UpdateTree(DMatrix *p_fmat, std::vector<GradientPair> const &gpair, common::Span<float> hess,
+                  RegTree *p_tree, HostDeviceVector<bst_node_t> *p_out_position) {
     p_last_tree_ = p_tree;
     this->InitData(p_fmat, hess);
@@ -231,6 +244,9
       driver.Push(best_splits.begin(), best_splits.end());
       expand_set = driver.Pop();
     }
+
+    auto &h_position = p_out_position->HostVector();
+    this->LeafPartition(tree, hess, &h_position);
   }
 };
 
@@ -275,6 +291,7 @@ class GlobalApproxUpdater : public TreeUpdater {
     sampled->resize(h_gpair.size());
     std::copy(h_gpair.cbegin(), h_gpair.cend(), sampled->begin());
     auto &rnd = common::GlobalRandom();
+
     if (param.subsample != 1.0) {
       CHECK(param.sampling_method != TrainParam::kGradientBased)
           << "Gradient based sampling is not supported for approx tree method.";
@@ -292,6 +309,7 @@ class GlobalApproxUpdater : public TreeUpdater {
   char const *Name() const override { return "grow_histmaker"; }
 
   void Update(HostDeviceVector<GradientPair> *gpair, DMatrix *m,
+              common::Span<HostDeviceVector<bst_node_t>> out_position,
               const std::vector<RegTree *> &trees) override {
     float lr = param_.learning_rate;
     param_.learning_rate = lr / trees.size();
@@ -313,12 +331,14 @@ class GlobalApproxUpdater : public TreeUpdater {
 
     cached_ = m;
 
+    size_t t_idx = 0;
     for (auto p_tree : trees) {
       if (hist_param_.single_precision_histogram) {
-        this->f32_impl_->UpdateTree(p_tree, h_gpair, hess, m);
+        this->f32_impl_->UpdateTree(m, h_gpair, hess, p_tree, &out_position[t_idx]);
       } else {
-        this->f64_impl_->UpdateTree(p_tree, h_gpair, hess, m);
+        this->f64_impl_->UpdateTree(m, h_gpair, hess, p_tree, &out_position[t_idx]);
       }
+      ++t_idx;
     }
     param_.learning_rate = lr;
   }
@@ -335,6 +355,8 @@ class GlobalApproxUpdater : public TreeUpdater {
     }
     return true;
   }
+
+  bool HasNodePosition() const override { return true; }
 };
 
 DMLC_REGISTRY_FILE_TAG(grow_histmaker);
diff --git a/src/tree/updater_approx.h b/src/tree/updater_approx.h
index ec54da19e5b0..bb37f99ec61d 100644
--- a/src/tree/updater_approx.h
+++ b/src/tree/updater_approx.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2021 XGBoost contributors
+ * Copyright 2021-2022 XGBoost contributors
 *
 * \brief Implementation for the approx tree method.
*/ @@ -18,6 +18,7 @@ #include "hist/expand_entry.h" #include "hist/param.h" #include "param.h" +#include "xgboost/generic_parameters.h" #include "xgboost/json.h" #include "xgboost/tree_updater.h" @@ -122,6 +123,12 @@ class ApproxRowPartitioner { auto const &Partitions() const { return row_set_collection_; } + void LeafPartition(Context const *ctx, RegTree const &tree, common::Span hess, + std::vector *p_out_position) const { + partition_builder_.LeafPartition(ctx, tree, this->Partitions(), p_out_position, + [&](size_t idx) -> bool { return hess[idx] - .0f == .0f; }); + } + auto operator[](bst_node_t nidx) { return row_set_collection_[nidx]; } auto const &operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; } diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index e3d716f2cba8..6d63a00a139a 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -96,9 +96,9 @@ class ColMaker: public TreeUpdater { } } - void Update(HostDeviceVector *gpair, - DMatrix* dmat, - const std::vector &trees) override { + void Update(HostDeviceVector *gpair, DMatrix *dmat, + common::Span> out_position, + const std::vector &trees) override { if (rabit::IsDistributed()) { LOG(FATAL) << "Updater `grow_colmaker` or `exact` tree method doesn't " "support distributed training."; diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index cb7dd9b7e8e4..569188fd5374 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -11,6 +11,9 @@ #include #include +#include "xgboost/base.h" +#include "xgboost/data.h" +#include "xgboost/generic_parameters.h" #include "xgboost/host_device_vector.h" #include "xgboost/parameter.h" #include "xgboost/span.h" @@ -35,6 +38,8 @@ #include "gpu_hist/histogram.cuh" #include "gpu_hist/evaluate_splits.cuh" #include "gpu_hist/expand_entry.cuh" +#include "xgboost/task.h" +#include "xgboost/tree_model.h" namespace xgboost { namespace tree { @@ -161,9 +166,9 @@ template struct GPUHistMakerDevice { private: GPUHistEvaluator evaluator_; + Context const* ctx_; public: - int device_id; EllpackPageImpl const* page; common::Span feature_types; BatchParam batch_param; @@ -195,12 +200,12 @@ struct GPUHistMakerDevice { // Storing split categories for last node. 
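  // (read by UpdatePosition when routing rows through the categorical split of that node)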
dh::caching_device_vector node_categories; - GPUHistMakerDevice(int _device_id, EllpackPageImpl const* _page, + GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page, common::Span _feature_types, bst_uint _n_rows, TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features, BatchParam _batch_param) - : evaluator_{_param, n_features, _device_id}, - device_id(_device_id), + : evaluator_{_param, n_features, ctx->gpu_id}, + ctx_(ctx), page(_page), feature_types{_feature_types}, param(std::move(_param)), @@ -216,14 +221,15 @@ struct GPUHistMakerDevice { node_sum_gradients.resize(param.MaxNodes()); // Init histogram - hist.Init(device_id, page->Cuts().TotalBins()); - monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(device_id)); - feature_groups.reset(new FeatureGroups( - page->Cuts(), page->is_dense, dh::MaxSharedMemoryOptin(device_id), sizeof(GradientSumT))); + hist.Init(ctx_->gpu_id, page->Cuts().TotalBins()); + monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id)); + feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense, + dh::MaxSharedMemoryOptin(ctx_->gpu_id), + sizeof(GradientSumT))); } ~GPUHistMakerDevice() { // NOLINT - dh::safe_cuda(cudaSetDevice(device_id)); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); } // Reset values for each update iteration @@ -235,10 +241,10 @@ struct GPUHistMakerDevice { this->column_sampler.Init(num_columns, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); - dh::safe_cuda(cudaSetDevice(device_id)); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); this->evaluator_.Reset(page->Cuts(), feature_types, task, dmat->Info().num_col_, param, - device_id); + ctx_->gpu_id); this->interaction_constraints.Reset(); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), GradientPairPrecise{}); @@ -256,7 +262,7 @@ struct GPUHistMakerDevice { histogram_rounding = CreateRoundingFactor(this->gpair); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(device_id, sample.sample_rows)); + row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows)); hist.Reset(); } @@ -264,10 +270,10 @@ struct GPUHistMakerDevice { int nidx = RegTree::kRoot; GPUTrainingParam gpu_param(param); auto sampled_features = column_sampler.GetFeatureSet(0); - sampled_features->SetDevice(device_id); + sampled_features->SetDevice(ctx_->gpu_id); common::Span feature_set = interaction_constraints.Query(sampled_features->DeviceSpan(), nidx); - auto matrix = page->GetDeviceAccessor(device_id); + auto matrix = page->GetDeviceAccessor(ctx_->gpu_id); EvaluateSplitInputs inputs{nidx, root_sum, gpu_param, @@ -287,14 +293,14 @@ struct GPUHistMakerDevice { dh::TemporaryArray splits_out(2); GPUTrainingParam gpu_param(param); auto left_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(left_nidx)); - left_sampled_features->SetDevice(device_id); + left_sampled_features->SetDevice(ctx_->gpu_id); common::Span left_feature_set = interaction_constraints.Query(left_sampled_features->DeviceSpan(), left_nidx); auto right_sampled_features = column_sampler.GetFeatureSet(tree.GetDepth(right_nidx)); - right_sampled_features->SetDevice(device_id); + right_sampled_features->SetDevice(ctx_->gpu_id); common::Span right_feature_set = interaction_constraints.Query(right_sampled_features->DeviceSpan(), left_nidx); - auto matrix = page->GetDeviceAccessor(device_id); + auto matrix = 
page->GetDeviceAccessor(ctx_->gpu_id); EvaluateSplitInputs left{left_nidx, candidate.split.left_sum, @@ -325,8 +331,8 @@ struct GPUHistMakerDevice { hist.AllocateHistogram(nidx); auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(page->GetDeviceAccessor(device_id), - feature_groups->DeviceAccessor(device_id), gpair, + BuildGradientHistogram(page->GetDeviceAccessor(ctx_->gpu_id), + feature_groups->DeviceAccessor(ctx_->gpu_id), gpair, d_ridx, d_node_hist, histogram_rounding); } @@ -351,7 +357,7 @@ struct GPUHistMakerDevice { void UpdatePosition(int nidx, RegTree* p_tree) { RegTree::Node split_node = (*p_tree)[nidx]; auto split_type = p_tree->NodeSplitType(nidx); - auto d_matrix = page->GetDeviceAccessor(device_id); + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); auto node_cats = dh::ToSpan(node_categories); row_partitioner->UpdatePosition( @@ -384,7 +390,8 @@ struct GPUHistMakerDevice { // After tree update is finished, update the position of all training // instances to their final leaf. This information is used later to update the // prediction cache - void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat) { + void FinalisePosition(RegTree const* p_tree, DMatrix* p_fmat, ObjInfo task, + HostDeviceVector* p_out_position) { dh::TemporaryArray d_nodes(p_tree->GetNodes().size()); dh::safe_cuda(cudaMemcpyAsync(d_nodes.data().get(), p_tree->GetNodes().data(), d_nodes.size() * sizeof(RegTree::Node), @@ -405,17 +412,21 @@ struct GPUHistMakerDevice { if (row_partitioner->GetRows().size() != p_fmat->Info().num_row_) { row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner.reset(new RowPartitioner(device_id, p_fmat->Info().num_row_)); + row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, p_fmat->Info().num_row_)); + } + if (task.UpdateTreeLeaf() && !p_fmat->SingleColBlock() && param.subsample != 1.0) { + // see comment in the `FinalisePositionInPage`. + LOG(FATAL) << "Current objective function can not be used with subsampled external memory."; } if (page->n_rows == p_fmat->Info().num_row_) { - FinalisePositionInPage(page, dh::ToSpan(d_nodes), - dh::ToSpan(d_split_types), dh::ToSpan(d_categories), - dh::ToSpan(d_categories_segments)); + FinalisePositionInPage(page, dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); } else { - for (auto& batch : p_fmat->GetBatches(batch_param)) { - FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), - dh::ToSpan(d_split_types), dh::ToSpan(d_categories), - dh::ToSpan(d_categories_segments)); + for (auto const& batch : p_fmat->GetBatches(batch_param)) { + FinalisePositionInPage(batch.Impl(), dh::ToSpan(d_nodes), dh::ToSpan(d_split_types), + dh::ToSpan(d_categories), dh::ToSpan(d_categories_segments), task, + p_out_position); } } } @@ -424,9 +435,13 @@ struct GPUHistMakerDevice { const common::Span d_nodes, common::Span d_feature_types, common::Span categories, - common::Span categories_segments) { - auto d_matrix = page->GetDeviceAccessor(device_id); + common::Span categories_segments, + ObjInfo task, + HostDeviceVector* p_out_position) { + auto d_matrix = page->GetDeviceAccessor(ctx_->gpu_id); + auto d_gpair = this->gpair; row_partitioner->FinalisePosition( + ctx_, task, p_out_position, [=] __device__(size_t row_id, int position) { // What happens if user prune the tree? 
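           // Rows that fall outside the current page are reported as
           // kIgnoredTreePosition so that the batch owning them can finalise
           // them; rows removed by sampling are written bit-flipped (~nidx) into
           // p_out_position so ObjFunction::UpdateTreeLeaf can skip them.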
if (!d_matrix.IsInRange(row_id)) { @@ -457,13 +472,20 @@ struct GPUHistMakerDevice { } node = d_nodes[position]; } + return position; + }, + [d_gpair] __device__(size_t ridx) { + // FIXME(jiamingy): Doesn't work when sampling is used with external memory as + // the sampler compacts the gradient vector. + return d_gpair[ridx].GetHess() - .0f == 0.f; }); } - void UpdatePredictionCache(linalg::VectorView out_preds_d) { - dh::safe_cuda(cudaSetDevice(device_id)); - CHECK_EQ(out_preds_d.DeviceIdx(), device_id); + void UpdatePredictionCache(linalg::VectorView out_preds_d, RegTree const* p_tree) { + CHECK(p_tree); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + CHECK_EQ(out_preds_d.DeviceIdx(), ctx_->gpu_id); auto d_ridx = row_partitioner->GetRows(); GPUTrainingParam param_d(param); @@ -476,12 +498,15 @@ struct GPUHistMakerDevice { auto d_node_sum_gradients = device_node_sum_gradients.data().get(); auto tree_evaluator = evaluator_.GetEvaluator(); - dh::LaunchN(d_ridx.size(), [=, out_preds_d = out_preds_d] __device__(int local_idx) mutable { - int pos = d_position[local_idx]; - bst_float weight = - tree_evaluator.CalcWeight(pos, param_d, GradStats{d_node_sum_gradients[pos]}); - static_assert(!std::is_const::value, ""); - out_preds_d(d_ridx[local_idx]) += weight * param_d.learning_rate; + auto const& h_nodes = p_tree->GetNodes(); + dh::caching_device_vector nodes(h_nodes.size()); + dh::safe_cuda(cudaMemcpyAsync(nodes.data().get(), h_nodes.data(), + h_nodes.size() * sizeof(RegTree::Node), cudaMemcpyHostToDevice)); + auto d_nodes = dh::ToSpan(nodes); + dh::LaunchN(d_ridx.size(), [=] XGBOOST_DEVICE(size_t idx) mutable { + bst_node_t nidx = d_position[idx]; + auto weight = d_nodes[nidx].LeafValue(); + out_preds_d(d_ridx[idx]) += weight; }); row_partitioner.reset(); } @@ -610,7 +635,8 @@ struct GPUHistMakerDevice { } void UpdateTree(HostDeviceVector* gpair_all, DMatrix* p_fmat, ObjInfo task, - RegTree* p_tree, dh::AllReducer* reducer) { + RegTree* p_tree, dh::AllReducer* reducer, + HostDeviceVector* p_out_position) { auto& tree = *p_tree; Driver driver(static_cast(param.grow_policy)); @@ -641,7 +667,7 @@ struct GPUHistMakerDevice { int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); - // Only create child entries if needed + // Only create child entries if needed_ if (GPUExpandEntry::ChildIsValid(param, tree.GetDepth(left_child_nidx), num_leaves)) { monitor.Start("UpdatePosition"); @@ -671,7 +697,7 @@ struct GPUHistMakerDevice { } monitor.Start("FinalisePosition"); - this->FinalisePosition(p_tree, p_fmat); + this->FinalisePosition(p_tree, p_fmat, task, p_out_position); monitor.Stop("FinalisePosition"); } }; @@ -682,7 +708,7 @@ class GPUHistMakerSpecialised { explicit GPUHistMakerSpecialised(ObjInfo task) : task_{task} {}; void Configure(const Args& args, GenericParameter const* generic_param) { param_.UpdateAllowUnknown(args); - generic_param_ = generic_param; + ctx_ = generic_param; hist_maker_param_.UpdateAllowUnknown(args); dh::CheckComputeCapability(); @@ -694,20 +720,24 @@ class GPUHistMakerSpecialised { } void Update(HostDeviceVector* gpair, DMatrix* dmat, + common::Span> out_position, const std::vector& trees) { monitor_.Start("Update"); // rescale learning rate according to size of trees float lr = param_.learning_rate; param_.learning_rate = lr / trees.size(); + // build tree try { + size_t t_idx{0}; for (xgboost::RegTree* tree : trees) { - this->UpdateTree(gpair, dmat, tree); + this->UpdateTree(gpair, dmat, tree, 
&out_position[t_idx]); if (hist_maker_param_.debug_synchronize) { this->CheckTreesSynchronized(tree); } + ++t_idx; } dh::safe_cuda(cudaGetLastError()); } catch (const std::exception& e) { @@ -719,41 +749,36 @@ class GPUHistMakerSpecialised { } void InitDataOnce(DMatrix* dmat) { - device_ = generic_param_->gpu_id; - CHECK_GE(device_, 0) << "Must have at least one device"; + CHECK_GE(ctx_->gpu_id, 0) << "Must have at least one device"; info_ = &dmat->Info(); - reducer_.Init({device_}); // NOLINT + reducer_.Init({ctx_->gpu_id}); // NOLINT // Synchronise the column sampling seed uint32_t column_sampling_seed = common::GlobalRandom()(); rabit::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0); BatchParam batch_param{ - device_, + ctx_->gpu_id, param_.max_bin, }; auto page = (*dmat->GetBatches(batch_param).begin()).Impl(); - dh::safe_cuda(cudaSetDevice(device_)); - info_->feature_types.SetDevice(device_); - maker.reset(new GPUHistMakerDevice(device_, - page, - info_->feature_types.ConstDeviceSpan(), - info_->num_row_, - param_, - column_sampling_seed, - info_->num_col_, - batch_param)); + dh::safe_cuda(cudaSetDevice(ctx_->gpu_id)); + info_->feature_types.SetDevice(ctx_->gpu_id); + maker.reset(new GPUHistMakerDevice( + ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, param_, + column_sampling_seed, info_->num_col_, batch_param)); p_last_fmat_ = dmat; initialised_ = true; } - void InitData(DMatrix* dmat) { + void InitData(DMatrix* dmat, RegTree const* p_tree) { if (!initialised_) { monitor_.Start("InitDataOnce"); this->InitDataOnce(dmat); monitor_.Stop("InitDataOnce"); } + p_last_tree_ = p_tree; } // Only call this method for testing @@ -771,13 +796,14 @@ class GPUHistMakerSpecialised { CHECK(*local_tree == reference_tree); } - void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree) { + void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, + HostDeviceVector* p_out_position) { monitor_.Start("InitData"); - this->InitData(p_fmat); + this->InitData(p_fmat, p_tree); monitor_.Stop("InitData"); - gpair->SetDevice(device_); - maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_); + gpair->SetDevice(ctx_->gpu_id); + maker->UpdateTree(gpair, p_fmat, task_, p_tree, &reducer_, p_out_position); } bool UpdatePredictionCache(const DMatrix *data, @@ -786,7 +812,7 @@ class GPUHistMakerSpecialised { return false; } monitor_.Start("UpdatePredictionCache"); - maker->UpdatePredictionCache(p_out_preds); + maker->UpdatePredictionCache(p_out_preds, p_last_tree_); monitor_.Stop("UpdatePredictionCache"); return true; } @@ -800,12 +826,12 @@ class GPUHistMakerSpecialised { bool initialised_ { false }; GPUHistMakerTrainParam hist_maker_param_; - GenericParameter const* generic_param_; + Context const* ctx_; dh::AllReducer reducer_; DMatrix* p_last_fmat_ { nullptr }; - int device_{-1}; + RegTree const* p_last_tree_{nullptr}; ObjInfo task_; common::Monitor monitor_; @@ -859,17 +885,17 @@ class GPUHistMaker : public TreeUpdater { } void Update(HostDeviceVector* gpair, DMatrix* dmat, + common::Span> out_position, const std::vector& trees) override { if (hist_maker_param_.single_precision_histogram) { - float_maker_->Update(gpair, dmat, trees); + float_maker_->Update(gpair, dmat, out_position, trees); } else { - double_maker_->Update(gpair, dmat, trees); + double_maker_->Update(gpair, dmat, out_position, trees); } } - bool - UpdatePredictionCache(const DMatrix *data, - linalg::VectorView p_out_preds) override { + bool UpdatePredictionCache(const 
DMatrix* data, + linalg::VectorView p_out_preds) override { if (hist_maker_param_.single_precision_histogram) { return float_maker_->UpdatePredictionCache(data, p_out_preds); } else { @@ -881,6 +907,8 @@ class GPUHistMaker : public TreeUpdater { return "grow_gpu_hist"; } + bool HasNodePosition() const override { return true; } + private: GPUHistMakerTrainParam hist_maker_param_; ObjInfo task_; diff --git a/src/tree/updater_histmaker.cc b/src/tree/updater_histmaker.cc index 0a85d2d73832..27fc42455d2c 100644 --- a/src/tree/updater_histmaker.cc +++ b/src/tree/updater_histmaker.cc @@ -24,9 +24,9 @@ DMLC_REGISTRY_FILE_TAG(updater_histmaker); class HistMaker: public BaseMaker { public: - void Update(HostDeviceVector *gpair, - DMatrix *p_fmat, - const std::vector &trees) override { + void Update(HostDeviceVector *gpair, DMatrix *p_fmat, + common::Span> out_position, + const std::vector &trees) override { interaction_constraints_.Configure(param_, p_fmat->Info().num_col_); // rescale learning rate according to size of trees float lr = param_.learning_rate; diff --git a/src/tree/updater_prune.cc b/src/tree/updater_prune.cc index f71f1c698cb9..dcda4a3b34a2 100644 --- a/src/tree/updater_prune.cc +++ b/src/tree/updater_prune.cc @@ -50,9 +50,9 @@ class TreePruner: public TreeUpdater { } // update the tree, do pruning - void Update(HostDeviceVector *gpair, - DMatrix *p_fmat, - const std::vector &trees) override { + void Update(HostDeviceVector* gpair, DMatrix* p_fmat, + common::Span> out_position, + const std::vector& trees) override { pruner_monitor_.Start("PrunerUpdate"); // rescale learning rate according to size of trees float lr = param_.learning_rate; @@ -61,7 +61,7 @@ class TreePruner: public TreeUpdater { this->DoPrune(tree); } param_.learning_rate = lr; - syncher_->Update(gpair, p_fmat, trees); + syncher_->Update(gpair, p_fmat, out_position, trees); pruner_monitor_.Stop("PrunerUpdate"); } diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 0e1b6db47691..011733b4582a 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -36,6 +36,7 @@ void QuantileHistMaker::Configure(const Args &args) { } void QuantileHistMaker::Update(HostDeviceVector *gpair, DMatrix *dmat, + common::Span> out_position, const std::vector &trees) { // rescale learning rate according to size of trees float lr = param_.learning_rate; @@ -53,12 +54,15 @@ void QuantileHistMaker::Update(HostDeviceVector *gpair, DMatrix *d } } + size_t t_idx{0}; for (auto p_tree : trees) { + auto &t_row_position = out_position[t_idx]; if (hist_maker_param_.single_precision_histogram) { - this->float_builder_->UpdateTree(gpair, dmat, p_tree); + this->float_builder_->UpdateTree(gpair, dmat, p_tree, &t_row_position); } else { - this->double_builder_->UpdateTree(gpair, dmat, p_tree); + this->double_builder_->UpdateTree(gpair, dmat, p_tree, &t_row_position); } + ++t_idx; } param_.learning_rate = lr; @@ -169,13 +173,29 @@ void QuantileHistMaker::Builder::BuildHistogram( } } +template +void QuantileHistMaker::Builder::LeafPartition( + RegTree const &tree, common::Span gpair, + std::vector *p_out_position) { + monitor_->Start(__func__); + if (!evaluator_->Task().UpdateTreeLeaf()) { + return; + } + for (auto const &part : partitioner_) { + part.LeafPartition(ctx_, tree, gpair, p_out_position); + } + monitor_->Stop(__func__); +} + template void QuantileHistMaker::Builder::ExpandTree( - DMatrix *p_fmat, RegTree *p_tree, const std::vector &gpair_h) { + DMatrix *p_fmat, RegTree *p_tree, 
const std::vector &gpair_h, + HostDeviceVector *p_out_position) { monitor_->Start(__func__); Driver driver(static_cast(param_.grow_policy)); driver.Push(this->InitRoot(p_fmat, p_tree, gpair_h)); + auto const &tree = *p_tree; bst_node_t num_leaves{1}; auto expand_set = driver.Pop(); @@ -208,7 +228,6 @@ void QuantileHistMaker::Builder::ExpandTree( std::vector best_splits; if (!valid_candidates.empty()) { this->BuildHistogram(p_fmat, p_tree, valid_candidates, gpair_h); - auto const &tree = *p_tree; for (auto const &candidate : valid_candidates) { int left_child_nidx = tree[candidate.nid].LeftChild(); int right_child_nidx = tree[candidate.nid].RightChild(); @@ -228,12 +247,15 @@ void QuantileHistMaker::Builder::ExpandTree( expand_set = driver.Pop(); } + auto &h_out_position = p_out_position->HostVector(); + this->LeafPartition(tree, gpair_h, &h_out_position); monitor_->Stop(__func__); } template -void QuantileHistMaker::Builder::UpdateTree(HostDeviceVector *gpair, - DMatrix *p_fmat, RegTree *p_tree) { +void QuantileHistMaker::Builder::UpdateTree( + HostDeviceVector *gpair, DMatrix *p_fmat, RegTree *p_tree, + HostDeviceVector *p_out_position) { monitor_->Start(__func__); std::vector *gpair_ptr = &(gpair->HostVector()); @@ -246,8 +268,7 @@ void QuantileHistMaker::Builder::UpdateTree(HostDeviceVectorInitData(p_fmat, *p_tree, gpair_ptr); - ExpandTree(p_fmat, p_tree, *gpair_ptr); - + ExpandTree(p_fmat, p_tree, *gpair_ptr, p_out_position); monitor_->Stop(__func__); } diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 3c03a371ebfb..6d5919abb75f 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -17,6 +17,7 @@ #include #include +#include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/json.h" @@ -214,6 +215,15 @@ class HistRowPartitioner { size_t Size() const { return std::distance(row_set_collection_.begin(), row_set_collection_.end()); } + + void LeafPartition(Context const* ctx, RegTree const& tree, + common::Span gpair, + std::vector* p_out_position) const { + partition_builder_.LeafPartition( + ctx, tree, this->Partitions(), p_out_position, + [&](size_t idx) -> bool { return gpair[idx].GetHess() - .0f == .0f; }); + } + auto& operator[](bst_node_t nidx) { return row_set_collection_[nidx]; } auto const& operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; } }; @@ -228,8 +238,8 @@ class QuantileHistMaker: public TreeUpdater { explicit QuantileHistMaker(ObjInfo task) : task_{task} {} void Configure(const Args& args) override; - void Update(HostDeviceVector* gpair, - DMatrix* dmat, + void Update(HostDeviceVector* gpair, DMatrix* dmat, + common::Span> out_position, const std::vector& trees) override; bool UpdatePredictionCache(const DMatrix *data, @@ -266,6 +276,8 @@ class QuantileHistMaker: public TreeUpdater { return "grow_quantile_histmaker"; } + bool HasNodePosition() const override { return true; } + protected: CPUHistMakerTrainParam hist_maker_param_; // training parameter @@ -289,7 +301,8 @@ class QuantileHistMaker: public TreeUpdater { monitor_->Init("Quantile::Builder"); } // update one tree, growing - void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree); + void UpdateTree(HostDeviceVector* gpair, DMatrix* p_fmat, RegTree* p_tree, + HostDeviceVector* p_out_position); bool UpdatePredictionCache(DMatrix const* data, linalg::VectorView out_preds) const; @@ -308,7 +321,11 @@ class QuantileHistMaker: public TreeUpdater { std::vector const& valid_candidates, std::vector const& 
gpair); - void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h); + void LeafPartition(RegTree const& tree, common::Span gpair, + std::vector* p_out_position); + + void ExpandTree(DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h, + HostDeviceVector* p_out_position); private: const size_t n_trees_; diff --git a/src/tree/updater_refresh.cc b/src/tree/updater_refresh.cc index d17c1e1444f7..8e82ae9f914c 100644 --- a/src/tree/updater_refresh.cc +++ b/src/tree/updater_refresh.cc @@ -42,9 +42,9 @@ class TreeRefresher: public TreeUpdater { return true; } // update the tree, do pruning - void Update(HostDeviceVector *gpair, - DMatrix *p_fmat, - const std::vector &trees) override { + void Update(HostDeviceVector *gpair, DMatrix *p_fmat, + common::Span> out_position, + const std::vector &trees) override { if (trees.size() == 0) return; const std::vector &gpair_h = gpair->ConstHostVector(); // thread temporal space diff --git a/src/tree/updater_sync.cc b/src/tree/updater_sync.cc index 4f7c7a1a85a6..a4c1486fbf90 100644 --- a/src/tree/updater_sync.cc +++ b/src/tree/updater_sync.cc @@ -31,9 +31,9 @@ class TreeSyncher: public TreeUpdater { return "prune"; } - void Update(HostDeviceVector* , - DMatrix*, - const std::vector &trees) override { + void Update(HostDeviceVector*, DMatrix*, + common::Span> out_position, + const std::vector& trees) override { if (rabit::GetWorldSize() == 1) return; std::string s_model; common::MemoryBufferStream fs(&s_model); diff --git a/tests/cpp/common/test_stats.cc b/tests/cpp/common/test_stats.cc new file mode 100644 index 000000000000..2a1e375c0f20 --- /dev/null +++ b/tests/cpp/common/test_stats.cc @@ -0,0 +1,58 @@ +/*! + * Copyright 2022 by XGBoost Contributors + */ +#include +#include + +#include "../../../src/common/stats.h" + +namespace xgboost { +namespace common { +TEST(Stats, Quantile) { + { + linalg::Tensor arr({20.f, 0.f, 15.f, 50.f, 40.f, 0.f, 35.f}, {7}, Context::kCpuId); + std::vector index{0, 2, 3, 4, 6}; + auto h_arr = arr.HostView(); + auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(index[i]); }); + auto end = beg + index.size(); + auto q = Quantile(0.40f, beg, end); + ASSERT_EQ(q, 26.0); + + q = Quantile(0.20f, beg, end); + ASSERT_EQ(q, 16.0); + + q = Quantile(0.10f, beg, end); + ASSERT_EQ(q, 15.0); + } + + { + std::vector vec{1., 2., 3., 4., 5.}; + auto beg = MakeIndexTransformIter([&](size_t i) { return vec[i]; }); + auto end = beg + vec.size(); + auto q = Quantile(0.5f, beg, end); + ASSERT_EQ(q, 3.); + } +} + +TEST(Stats, WeightedQuantile) { + linalg::Tensor arr({1.f, 2.f, 3.f, 4.f, 5.f}, {5}, Context::kCpuId); + linalg::Tensor weight({1.f, 1.f, 1.f, 1.f, 1.f}, {5}, Context::kCpuId); + + auto h_arr = arr.HostView(); + auto h_weight = weight.HostView(); + + auto beg = MakeIndexTransformIter([&](size_t i) { return h_arr(i); }); + auto end = beg + arr.Size(); + auto w = MakeIndexTransformIter([&](size_t i) { return h_weight(i); }); + + auto q = WeightedQuantile(0.50f, beg, end, w); + ASSERT_EQ(q, 3); + + q = WeightedQuantile(0.0, beg, end, w); + ASSERT_EQ(q, 1); + + q = WeightedQuantile(1.0, beg, end, w); + ASSERT_EQ(q, 5); +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/common/test_stats.cu b/tests/cpp/common/test_stats.cu new file mode 100644 index 000000000000..eee92921d931 --- /dev/null +++ b/tests/cpp/common/test_stats.cu @@ -0,0 +1,77 @@ +/*! 
+ * Copyright 2022 by XGBoost Contributors + */ +#include +#include +#include + +#include "../../../src/common/stats.cuh" +#include "xgboost/base.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/host_device_vector.h" +#include "xgboost/linalg.h" + +namespace xgboost { +namespace common { +namespace { +class StatsGPU : public ::testing::Test { + private: + linalg::Tensor arr_{ + {1.f, 2.f, 3.f, 4.f, 5.f, + 2.f, 4.f, 5.f, 3.f, 1.f}, + {10}, 0}; + linalg::Tensor indptr_{{0, 5, 10}, {3}, 0}; + HostDeviceVector resutls_; + using TestSet = std::vector>; + Context ctx_; + + void Check(float expected) { + auto const& h_results = resutls_.HostVector(); + ASSERT_EQ(h_results.size(), indptr_.Size() - 1); + ASSERT_EQ(h_results.front(), expected); + EXPECT_EQ(h_results.back(), expected); + } + + public: + void SetUp() override { ctx_.gpu_id = 0; } + void Weighted() { + auto d_arr = arr_.View(0); + auto d_key = indptr_.View(0); + + auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] __device__(size_t i) { return d_key(i); }); + auto val_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) { return d_arr(i); }); + linalg::Tensor weights{{10}, 0}; + linalg::ElementWiseTransformDevice(weights.View(0), + [=] XGBOOST_DEVICE(size_t, float) { return 1.0; }); + auto w_it = weights.Data()->ConstDevicePointer(); + for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) { + SegmentedWeightedQuantile(&ctx_, pair.first, key_it, key_it + indptr_.Size(), val_it, + val_it + arr_.Size(), w_it, w_it + weights.Size(), &resutls_); + this->Check(pair.second); + } + } + + void NonWeighted() { + auto d_arr = arr_.View(0); + auto d_key = indptr_.View(0); + + auto key_it = dh::MakeTransformIterator(thrust::make_counting_iterator(0ul), + [=] __device__(size_t i) { return d_key(i); }); + auto val_it = dh::MakeTransformIterator( + thrust::make_counting_iterator(0ul), [=] XGBOOST_DEVICE(size_t i) { return d_arr(i); }); + + for (auto const& pair : TestSet{{0.0f, 1.0f}, {0.5f, 3.0f}, {1.0f, 5.0f}}) { + SegmentedQuantile(&ctx_, pair.first, key_it, key_it + indptr_.Size(), val_it, + val_it + arr_.Size(), &resutls_); + this->Check(pair.second); + } + } +}; +} // anonymous namespace + +TEST_F(StatsGPU, Quantile) { this->NonWeighted(); } +TEST_F(StatsGPU, WeightedQuantile) { this->Weighted(); } +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index c416d134307c..f9fe7d38660d 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,5 +1,5 @@ /*! 
- * Copyright 2019-2021 XGBoost contributors + * Copyright 2019-2022 XGBoost contributors */ #include #include @@ -69,13 +69,13 @@ TEST(GBTree, PredictionCache) { auto p_m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); auto gpair = GenerateRandomGradients(kRows); PredictionCacheEntry out_predictions; - gbtree.DoBoost(p_m.get(), &gpair, &out_predictions); + gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0); ASSERT_EQ(1, out_predictions.version); std::vector first_iter = out_predictions.predictions.HostVector(); // Add 1 more boosted round - gbtree.DoBoost(p_m.get(), &gpair, &out_predictions); + gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0); ASSERT_EQ(2, out_predictions.version); // Update the cache for all rounds @@ -83,7 +83,7 @@ TEST(GBTree, PredictionCache) { gbtree.PredictBatch(p_m.get(), &out_predictions, false, 0, 0); ASSERT_EQ(2, out_predictions.version); - gbtree.DoBoost(p_m.get(), &gpair, &out_predictions); + gbtree.DoBoost(p_m.get(), &gpair, &out_predictions, nullptr); // drop the cache. gbtree.PredictBatch(p_m.get(), &out_predictions, false, 1, 2); ASSERT_EQ(0, out_predictions.version); diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 05c138781e0d..68faa09642ed 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -548,7 +548,7 @@ std::unique_ptr CreateTrainedGBM( PredictionCacheEntry predts; - gbm->DoBoost(p_dmat.get(), &gpair, &predts); + gbm->DoBoost(p_dmat.get(), &gpair, &predts, nullptr); return gbm; } diff --git a/tests/cpp/objective/test_regression_obj.cc b/tests/cpp/objective/test_regression_obj.cc index ef4529934337..a26f69476152 100644 --- a/tests/cpp/objective/test_regression_obj.cc +++ b/tests/cpp/objective/test_regression_obj.cc @@ -1,11 +1,14 @@ /*! 
- * Copyright 2017-2021 XGBoost contributors
+ * Copyright 2017-2022 XGBoost contributors
  */
 #include <gtest/gtest.h>
-#include <xgboost/generic_parameters.h>
 #include <xgboost/json.h>
 #include <xgboost/objective.h>
+#include <numeric>
+
+#include "../../../src/objective/adaptive.h"
 #include "../helpers.h"
+
 namespace xgboost {
 
 TEST(Objective, DeclareUnifiedTest(LinearRegressionGPair)) {
@@ -378,4 +381,113 @@ TEST(Objective, CoxRegressionGPair) {
                   { 0, 0, 0, 0.160f, 0.186f, 0.348f, 0.610f, 0.639f});
 }
 #endif
+
+TEST(Objective, DeclareUnifiedTest(AbsoluteError)) {
+  Context ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:absoluteerror", &ctx)};
+  obj->Configure({});
+  CheckConfigReload(obj, "reg:absoluteerror");
+
+  MetaInfo info;
+  std::vector<float> labels{0.f, 3.f, 2.f, 5.f, 4.f, 7.f};
+  info.labels.Reshape(6, 1);
+  info.labels.Data()->HostVector() = labels;
+  info.num_row_ = labels.size();
+  HostDeviceVector<float> predt{1.f, 2.f, 3.f, 4.f, 5.f, 6.f};
+  info.weights_.HostVector() = {1.f, 1.f, 1.f, 1.f, 1.f, 1.f};
+
+  CheckObjFunction(obj, predt.HostVector(), labels, info.weights_.HostVector(),
+                   {1.f, -1.f, 1.f, -1.f, 1.f, -1.f}, info.weights_.HostVector());
+
+  RegTree tree;
+  tree.ExpandNode(0, /*split_index=*/1, 2, true, 0.0f, 2.f, 3.f, 4.f, 2.f, 1.f, 1.f);
+
+  HostDeviceVector<bst_node_t> position(labels.size(), 0);
+  auto& h_position = position.HostVector();
+  for (size_t i = 0; i < labels.size(); ++i) {
+    if (i < labels.size() / 2) {
+      h_position[i] = 1;  // left
+    } else {
+      h_position[i] = 2;  // right
+    }
+  }
+
+  auto& h_predt = predt.HostVector();
+  for (size_t i = 0; i < h_predt.size(); ++i) {
+    h_predt[i] = labels[i] + i;
+  }
+
+  obj->UpdateTreeLeaf(position, info, predt, &tree);
+  ASSERT_EQ(tree[1].LeafValue(), -1);
+  ASSERT_EQ(tree[2].LeafValue(), -4);
+}
+
+TEST(Objective, DeclareUnifiedTest(AbsoluteErrorLeaf)) {
+  Context ctx = CreateEmptyGenericParam(GPUIDX);
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("reg:absoluteerror", &ctx)};
+  obj->Configure({});
+
+  MetaInfo info;
+  info.labels.Reshape(16, 1);
+  info.num_row_ = info.labels.Size();
+  CHECK_EQ(info.num_row_, 16);
+  auto h_labels = info.labels.HostView().Values();
+  std::iota(h_labels.begin(), h_labels.end(), 0);
+  HostDeviceVector<float> predt(h_labels.size());
+  auto& h_predt = predt.HostVector();
+  for (size_t i = 0; i < h_predt.size(); ++i) {
+    h_predt[i] = h_labels[i] + i;
+  }
+
+  HostDeviceVector<bst_node_t> position(info.labels.Size(), 0);
+  auto& h_position = position.HostVector();
+  for (int32_t i = 0; i < 3; ++i) {
+    h_position[i] = ~i;  // negation for sampled nodes.
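+    // ~i is strictly negative (~0 == -1), marking the row as excluded by
+    // sampling while keeping the node index recoverable via a second negation.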
+ } + for (size_t i = 3; i < 8; ++i) { + h_position[i] = 3; + } + // empty leaf for node 4 + for (size_t i = 8; i < 13; ++i) { + h_position[i] = 5; + } + for (size_t i = 13; i < h_labels.size(); ++i) { + h_position[i] = 6; + } + + RegTree tree; + tree.ExpandNode(0, /*split_index=*/1, 2, true, 0.0f, 2.f, 3.f, 4.f, 2.f, 1.f, 1.f); + tree.ExpandNode(1, /*split_index=*/1, 2, true, 0.0f, 2.f, 3.f, 4.f, 2.f, 1.f, 1.f); + tree.ExpandNode(2, /*split_index=*/1, 2, true, 0.0f, 2.f, 3.f, 4.f, 2.f, 1.f, 1.f); + ASSERT_EQ(tree.GetNumLeaves(), 4); + + auto empty_leaf = tree[4].LeafValue(); + obj->UpdateTreeLeaf(position, info, predt, &tree); + ASSERT_EQ(tree[3].LeafValue(), -5); + ASSERT_EQ(tree[4].LeafValue(), empty_leaf); + ASSERT_EQ(tree[5].LeafValue(), -10); + ASSERT_EQ(tree[6].LeafValue(), -14); +} + +TEST(Adaptive, DeclareUnifiedTest(MissingLeaf)) { + std::vector missing{1, 3}; + + std::vector h_nidx = {2, 4, 5}; + std::vector h_nptr = {0, 4, 8, 16}; + + obj::detail::FillMissingLeaf(missing, &h_nidx, &h_nptr); + + ASSERT_EQ(h_nidx[0], missing[0]); + ASSERT_EQ(h_nidx[2], missing[1]); + ASSERT_EQ(h_nidx[1], 2); + ASSERT_EQ(h_nidx[3], 4); + ASSERT_EQ(h_nidx[4], 5); + + ASSERT_EQ(h_nptr[0], 0); + ASSERT_EQ(h_nptr[1], 0); // empty + ASSERT_EQ(h_nptr[2], 4); + ASSERT_EQ(h_nptr[3], 4); // empty + ASSERT_EQ(h_nptr[4], 8); + ASSERT_EQ(h_nptr[5], 16); +} } // namespace xgboost diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index 1a466ed3ff10..f43747abdd9e 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2020 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #include #include @@ -222,7 +222,7 @@ void TestUpdatePredictionCache(bool use_subsampling) { PredictionCacheEntry predtion_cache; predtion_cache.predictions.Resize(kRows*kClasses, 0); // after one training iteration predtion_cache is filled with cached in QuantileHistMaker::Builder prediction values - gbm->DoBoost(dmat.get(), &gpair, &predtion_cache); + gbm->DoBoost(dmat.get(), &gpair, &predtion_cache, nullptr); PredictionCacheEntry out_predictions; // perform fair prediction on the same input data, should be equal to cached result diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 9b16cca5362d..c8aaf82dcb3e 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -1,7 +1,8 @@ /*! 
- * Copyright 2019-2021 by XGBoost Contributors + * Copyright 2019-2022 by XGBoost Contributors */ #include +#include #include #include @@ -10,6 +11,10 @@ #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" #include "../../helpers.h" +#include "xgboost/base.h" +#include "xgboost/generic_parameters.h" +#include "xgboost/task.h" +#include "xgboost/tree_model.h" namespace xgboost { namespace tree { @@ -103,17 +108,58 @@ TEST(RowPartitioner, Basic) { TestUpdatePosition(); } void TestFinalise() { const int kNumRows = 10; - RowPartitioner rp(0, kNumRows); - rp.FinalisePosition([=]__device__(RowPartitioner::RowIndexT ridx, int position) - { - return 7; - }); - auto position = rp.GetPositionHost(); - for(auto p:position) + + ObjInfo task{ObjInfo::kRegression, false, false}; + HostDeviceVector position; + Context ctx; + ctx.gpu_id = 0; + { - EXPECT_EQ(p, 7); + RowPartitioner rp(0, kNumRows); + rp.FinalisePosition( + &ctx, task, &position, + [=] __device__(RowPartitioner::RowIndexT ridx, int position) { return 7; }, + [] XGBOOST_DEVICE(size_t idx) { return false; }); + + auto position = rp.GetPositionHost(); + for (auto p : position) { + EXPECT_EQ(p, 7); + } + } + + /** + * Test for sampling. + */ + dh::device_vector hess(kNumRows); + for (size_t i = 0; i < hess.size(); ++i) { + // removed rows, 0, 3, 6, 9 + if (i % 3 == 0) { + hess[i] = 0; + } else { + hess[i] = i; + } + } + + auto d_hess = dh::ToSpan(hess); + + RowPartitioner rp(0, kNumRows); + rp.FinalisePosition( + &ctx, task, &position, + [] __device__(RowPartitioner::RowIndexT ridx, bst_node_t position) { + return ridx % 2 == 0 ? 1 : 2; + }, + [d_hess] __device__(size_t ridx) { return d_hess[ridx] - 0.f == 0.f; }); + + auto const& h_position = position.ConstHostVector(); + for (size_t ridx = 0; ridx < h_position.size(); ++ridx) { + if (ridx % 3 == 0) { + ASSERT_LT(h_position[ridx], 0); + } else { + ASSERT_EQ(h_position[ridx], ridx % 2 == 0 ? 
1 : 2); + } } } + TEST(RowPartitioner, Finalise) { TestFinalise(); } void TestIncorrectRow() { diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index a37c0973627e..2e2fd4a0b3d7 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -26,7 +26,7 @@ TEST(Approx, Partitioner) { std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(), [](auto gpair) { return gpair.GetHess(); }); - for (auto const &page : Xy->GetBatches({64, hess, true})) { + for (auto const& page : Xy->GetBatches({64, hess, true})) { bst_feature_t const split_ind = 0; { auto min_value = page.cut.MinValues()[split_ind]; @@ -44,9 +44,9 @@ TEST(Approx, Partitioner) { float split_value = page.cut.Values().at(ptr / 2); RegTree tree; GetSplit(&tree, split_value, &candidates); - auto left_nidx = tree[RegTree::kRoot].LeftChild(); partitioner.UpdatePosition(&ctx, page, candidates, &tree); + auto left_nidx = tree[RegTree::kRoot].LeftChild(); auto elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples); ASSERT_GT(elem.Size(), 1); @@ -54,6 +54,7 @@ TEST(Approx, Partitioner) { auto value = page.cut.Values().at(page.index[*it]); ASSERT_LE(value, split_value); } + auto right_nidx = tree[RegTree::kRoot].RightChild(); elem = partitioner[right_nidx]; for (auto it = elem.begin; it != elem.end; ++it) { @@ -63,5 +64,78 @@ TEST(Approx, Partitioner) { } } } +namespace { +void TestLeafPartition(size_t n_samples) { + size_t const n_features = 2, base_rowid = 0; + common::RowSetCollection row_set; + ApproxRowPartitioner partitioner{n_samples, base_rowid}; + + auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); + GenericParameter ctx; + std::vector candidates{{0, 0, 0.4}}; + RegTree tree; + std::vector hess(n_samples, 0); + // emulate sampling + auto not_sampled = [](size_t i) { + size_t const kSampleFactor{3}; + return i % kSampleFactor != 0; + }; + size_t n{0}; + for (size_t i = 0; i < hess.size(); ++i) { + if (not_sampled(i)) { + hess[i] = 1.0f; + ++n; + } + } + + std::vector h_nptr; + float split_value{0}; + for (auto const& page : Xy->GetBatches({Context::kCpuId, 64})) { + bst_feature_t const split_ind = 0; + auto ptr = page.cut.Ptrs()[split_ind + 1]; + split_value = page.cut.Values().at(ptr / 2); + GetSplit(&tree, split_value, &candidates); + partitioner.UpdatePosition(&ctx, page, candidates, &tree); + std::vector position; + partitioner.LeafPartition(&ctx, tree, hess, &position); + std::sort(position.begin(), position.end()); + size_t beg = std::distance( + position.begin(), + std::find_if(position.begin(), position.end(), [&](bst_node_t nidx) { return nidx >= 0; })); + std::vector nptr; + common::RunLengthEncode(position.cbegin() + beg, position.cend(), &nptr); + std::transform(nptr.begin(), nptr.end(), nptr.begin(), [&](size_t x) { return x + beg; }); + auto n_uniques = std::unique(position.begin() + beg, position.end()) - (position.begin() + beg); + ASSERT_EQ(nptr.size(), n_uniques + 1); + ASSERT_EQ(nptr[0], beg); + ASSERT_EQ(nptr.back(), n_samples); + + h_nptr = nptr; + } + + if (h_nptr.front() == n_samples) { + return; + } + + ASSERT_GE(h_nptr.size(), 2); + + for (auto const& page : Xy->GetBatches()) { + auto batch = page.GetView(); + size_t left{0}; + for (size_t i = 0; i < batch.Size(); ++i) { + if (not_sampled(i) && batch[i].front().fvalue < split_value) { + left++; + } + } + ASSERT_EQ(left, h_nptr[1] - h_nptr[0]); // equal to number of sampled assigned to left + } +} +} // anonymous namespace + +TEST(Approx, LeafPartition) { 
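+  // n_samples == 0 exercises the empty-DMatrix path; the larger sizes build
+  // trees in which every third row has hess == 0 to emulate sampling.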
+ for (auto n_samples : {0ul, 1ul, 128ul, 256ul}) { + TestLeafPartition(n_samples); + } +} } // namespace tree } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 82f40465deb2..3c93c283917a 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2021 XGBoost contributors + * Copyright 2017-2022 XGBoost contributors */ #include #include @@ -13,6 +13,7 @@ #include "../helpers.h" #include "../histogram_helpers.h" +#include "xgboost/generic_parameters.h" #include "xgboost/json.h" #include "../../../src/data/sparse_page_source.h" #include "../../../src/tree/updater_gpu_hist.cu" @@ -22,7 +23,6 @@ namespace xgboost { namespace tree { - TEST(GpuHist, DeviceHistogram) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. dh::safe_cuda(cudaSetDevice(0)); @@ -81,8 +81,9 @@ void TestBuildHist(bool use_shared_memory_histograms) { param.Init(args); auto page = BuildEllpackPage(kNRows, kNCols); BatchParam batch_param{}; - GPUHistMakerDevice maker(0, page.get(), {}, kNRows, param, - kNCols, kNCols, batch_param); + Context ctx{CreateEmptyGenericParam(0)}; + GPUHistMakerDevice maker(&ctx, page.get(), {}, kNRows, param, kNCols, kNCols, + batch_param); xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); @@ -158,14 +159,14 @@ TEST(GpuHist, ApplySplit) { BatchParam bparam; bparam.gpu_id = 0; bparam.max_bin = 3; + Context ctx{CreateEmptyGenericParam(0)}; for (auto& ellpack : m->GetBatches(bparam)){ auto impl = ellpack.Impl(); HostDeviceVector feature_types(10, FeatureType::kCategorical); feature_types.SetDevice(bparam.gpu_id); tree::GPUHistMakerDevice updater( - 0, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, - bparam); + &ctx, impl, feature_types.ConstDeviceSpan(), n_rows, tparam, 0, n_cols, bparam); updater.ApplySplit(candidate, &tree); ASSERT_EQ(tree.GetSplitTypes().size(), 3); @@ -224,8 +225,9 @@ TEST(GpuHist, EvaluateRootSplit) { // Initialize GPUHistMakerDevice auto page = BuildEllpackPage(kNRows, kNCols); BatchParam batch_param{}; - GPUHistMakerDevice maker( - 0, page.get(), {}, kNRows, param, kNCols, kNCols, batch_param); + Context ctx{CreateEmptyGenericParam(0)}; + GPUHistMakerDevice maker(&ctx, page.get(), {}, kNRows, param, kNCols, kNCols, + batch_param); // Initialize GPUHistMakerDevice::node_sum_gradients maker.node_sum_gradients = {}; @@ -348,7 +350,8 @@ void UpdateTree(HostDeviceVector* gpair, DMatrix* dmat, GenericParameter generic_param(CreateEmptyGenericParam(0)); hist_maker.Configure(args, &generic_param); - hist_maker.Update(gpair, dmat, {tree}); + std::vector> position(1); + hist_maker.Update(gpair, dmat, common::Span>{position}, {tree}); auto cache = linalg::VectorView{preds->DeviceSpan(), {preds->Size()}, 0}; hist_maker.UpdatePredictionCache(dmat, cache); } @@ -483,7 +486,7 @@ TEST(GpuHist, ExternalMemoryWithSampling) { auto preds_h = preds.ConstHostVector(); auto preds_ext_h = preds_ext.ConstHostVector(); for (int i = 0; i < kRows; i++) { - EXPECT_NEAR(preds_h[i], preds_ext_h[i], 1e-3); + ASSERT_NEAR(preds_h[i], preds_ext_h[i], 1e-3); } } diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc index 56878b159d4b..90dc0a411294 100644 --- a/tests/cpp/tree/test_histmaker.cc +++ b/tests/cpp/tree/test_histmaker.cc @@ -39,7 +39,8 @@ TEST(GrowHistMaker, InteractionConstraint) { updater->Configure(Args{ {"interaction_constraints", "[[0, 1]]"}, 
{"num_feature", std::to_string(kCols)}}); - updater->Update(&gradients, p_dmat.get(), {&tree}); + std::vector> position(1); + updater->Update(&gradients, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 4); ASSERT_EQ(tree[0].SplitIndex(), 1); @@ -55,7 +56,8 @@ TEST(GrowHistMaker, InteractionConstraint) { std::unique_ptr updater{ TreeUpdater::Create("grow_histmaker", ¶m, ObjInfo{ObjInfo::kRegression})}; updater->Configure(Args{{"num_feature", std::to_string(kCols)}}); - updater->Update(&gradients, p_dmat.get(), {&tree}); + std::vector> position(1); + updater->Update(&gradients, p_dmat.get(), position, {&tree}); ASSERT_EQ(tree.NumExtraNodes(), 10); ASSERT_EQ(tree[0].SplitIndex(), 1); diff --git a/tests/cpp/tree/test_prediction_cache.cc b/tests/cpp/tree/test_prediction_cache.cc index ebe66cf575b3..3e30e0699358 100644 --- a/tests/cpp/tree/test_prediction_cache.cc +++ b/tests/cpp/tree/test_prediction_cache.cc @@ -77,7 +77,8 @@ class TestPredictionCache : public ::testing::Test { std::vector trees{&tree}; auto gpair = GenerateRandomGradients(n_samples_); updater->Configure(Args{{"max_bin", "64"}}); - updater->Update(&gpair, Xy_.get(), trees); + std::vector> position(1); + updater->Update(&gpair, Xy_.get(), position, trees); HostDeviceVector out_prediction_cached; out_prediction_cached.SetDevice(ctx.gpu_id); out_prediction_cached.Resize(n_samples_); diff --git a/tests/cpp/tree/test_prune.cc b/tests/cpp/tree/test_prune.cc index dc6a8da21d72..77f78b1399d9 100644 --- a/tests/cpp/tree/test_prune.cc +++ b/tests/cpp/tree/test_prune.cc @@ -43,22 +43,23 @@ TEST(Updater, Prune) { pruner->Configure(cfg); // loss_chg < min_split_loss; + std::vector> position(trees.size()); tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 0.0f, 0.0f, /*left_sum=*/0.0f, /*right_sum=*/0.0f); - pruner->Update(&gpair, p_dmat.get(), trees); + pruner->Update(&gpair, p_dmat.get(), position, trees); ASSERT_EQ(tree.NumExtraNodes(), 0); // loss_chg > min_split_loss; tree.ExpandNode(0, 0, 0, true, 0.0f, 0.3f, 0.4f, 11.0f, 0.0f, /*left_sum=*/0.0f, /*right_sum=*/0.0f); - pruner->Update(&gpair, p_dmat.get(), trees); + pruner->Update(&gpair, p_dmat.get(), position, trees); ASSERT_EQ(tree.NumExtraNodes(), 2); // loss_chg == min_split_loss; tree.Stat(0).loss_chg = 10; - pruner->Update(&gpair, p_dmat.get(), trees); + pruner->Update(&gpair, p_dmat.get(), position, trees); ASSERT_EQ(tree.NumExtraNodes(), 2); @@ -74,7 +75,7 @@ TEST(Updater, Prune) { /*left_sum=*/0.0f, /*right_sum=*/0.0f); cfg.emplace_back(std::make_pair("max_depth", "1")); pruner->Configure(cfg); - pruner->Update(&gpair, p_dmat.get(), trees); + pruner->Update(&gpair, p_dmat.get(), position, trees); ASSERT_EQ(tree.NumExtraNodes(), 2); @@ -84,7 +85,7 @@ TEST(Updater, Prune) { /*left_sum=*/0.0f, /*right_sum=*/0.0f); cfg.emplace_back(std::make_pair("min_split_loss", "0")); pruner->Configure(cfg); - pruner->Update(&gpair, p_dmat.get(), trees); + pruner->Update(&gpair, p_dmat.get(), position, trees); ASSERT_EQ(tree.NumExtraNodes(), 2); } } // namespace tree diff --git a/tests/cpp/tree/test_refresh.cc b/tests/cpp/tree/test_refresh.cc index 5b71f0841e19..f0abd0a871aa 100644 --- a/tests/cpp/tree/test_refresh.cc +++ b/tests/cpp/tree/test_refresh.cc @@ -44,7 +44,8 @@ TEST(Updater, Refresh) { tree.Stat(cright).base_weight = 1.3; refresher->Configure(cfg); - refresher->Update(&gpair, p_dmat.get(), trees); + std::vector> position; + refresher->Update(&gpair, p_dmat.get(), position, trees); bst_float constexpr kEps = 1e-6; ASSERT_NEAR(-0.183392, tree[cright].LeafValue(), 
kEps); diff --git a/tests/cpp/tree/test_tree_stat.cc b/tests/cpp/tree/test_tree_stat.cc index 772420ce0f23..723ca34ebc93 100644 --- a/tests/cpp/tree/test_tree_stat.cc +++ b/tests/cpp/tree/test_tree_stat.cc @@ -27,7 +27,8 @@ class UpdaterTreeStatTest : public ::testing::Test { up->Configure(Args{}); RegTree tree; tree.param.num_feature = kCols; - up->Update(&gpairs_, p_dmat_.get(), {&tree}); + std::vector> position(1); + up->Update(&gpairs_, p_dmat_.get(), position, {&tree}); tree.WalkTree([&tree](bst_node_t nidx) { if (tree[nidx].IsLeaf()) { @@ -87,13 +88,15 @@ class UpdaterEtaTest : public ::testing::Test { RegTree tree_0; { tree_0.param.num_feature = kCols; - up_0->Update(&gpairs_, p_dmat_.get(), {&tree_0}); + std::vector> position(1); + up_0->Update(&gpairs_, p_dmat_.get(), position, {&tree_0}); } RegTree tree_1; { tree_1.param.num_feature = kCols; - up_1->Update(&gpairs_, p_dmat_.get(), {&tree_1}); + std::vector> position(1); + up_1->Update(&gpairs_, p_dmat_.get(), position, {&tree_1}); } tree_0.WalkTree([&](bst_node_t nidx) { if (tree_0[nidx].IsLeaf()) { @@ -149,7 +152,8 @@ class TestMinSplitLoss : public ::testing::Test { up->Configure(args); RegTree tree; - up->Update(&gpair_, dmat_.get(), {&tree}); + std::vector> position(1); + up->Update(&gpair_, dmat_.get(), position, {&tree}); auto n_nodes = tree.NumExtraNodes(); return n_nodes; diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index 38f4db07d366..4e41e637f7de 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -249,6 +249,8 @@ def predict_df(x): tm.dataset_strategy, shap_parameter_strategy) @settings(deadline=None, print_blob=True) def test_shap(self, num_rounds, dataset, param): + if dataset.name.endswith("-l1"): # not supported by the exact tree method + return param.update({"predictor": "gpu_predictor", "gpu_id": 0}) param = dataset.set_params(param) dmat = dataset.get_dmat() @@ -263,6 +265,8 @@ def test_shap(self, num_rounds, dataset, param): tm.dataset_strategy, shap_parameter_strategy) @settings(deadline=None, max_examples=20, print_blob=True) def test_shap_interactions(self, num_rounds, dataset, param): + if dataset.name.endswith("-l1"): # not supported by the exact tree method + return param.update({"predictor": "gpu_predictor", "gpu_id": 0}) param = dataset.set_params(param) dmat = dataset.get_dmat() diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index a3427b566360..e9d2bf06e229 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -90,6 +90,8 @@ def test_gpu_hist_device_dmatrix(self, param, num_rounds, dataset): tm.dataset_strategy) @settings(deadline=None, print_blob=True) def test_external_memory(self, param, num_rounds, dataset): + if dataset.name.endswith("-l1"): + return # We cannot handle empty dataset yet assume(len(dataset.y) > 0) param['tree_method'] = 'gpu_hist' diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index 1f0339e913ec..2074ce073648 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -1,7 +1,7 @@ """Copyright 2019-2022 XGBoost contributors""" import sys import os -from typing import Type, TypeVar, Any, Dict, List, Tuple +from typing import Type, TypeVar, Any, Dict, List import pytest import numpy as np import asyncio @@ -198,9 +198,19 @@ def run_gpu_hist( dtrain=m, num_boost_round=num_rounds, evals=[(m, "train")], - )["history"] 
+ )["history"]["train"][dataset.metric] note(history) - assert tm.non_increasing(history["train"][dataset.metric]) + + # See note on `ObjFunction::UpdateTreeLeaf`. + update_leaf = dataset.name.endswith("-l1") + if update_leaf and len(history) == 2: + assert history[0] + 1e-2 >= history[-1] + return + if update_leaf and len(history) > 2: + assert history[0] >= history[-1] + return + else: + assert tm.non_increasing(history) @pytest.mark.skipif(**tm.no_cudf()) @@ -305,8 +315,7 @@ def test_dask_classifier( def test_empty_dmatrix(self, local_cuda_cluster: LocalCUDACluster) -> None: with Client(local_cuda_cluster) as client: - parameters = {'tree_method': 'gpu_hist', - 'debug_synchronize': True} + parameters = {'tree_method': 'gpu_hist', 'debug_synchronize': True} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index cdf40d843b1a..4b56d37d4493 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -40,6 +40,8 @@ class TestTreeMethod: tm.dataset_strategy) @settings(deadline=None, print_blob=True) def test_exact(self, param, num_rounds, dataset): + if dataset.name.endswith("-l1"): + return param['tree_method'] = 'exact' param = dataset.set_params(param) result = train_result(param, dataset.get_dmat(), num_rounds) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 21e7983cf59f..a023112321e7 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -35,6 +35,7 @@ import dask.array as da from xgboost.dask import DaskDMatrix +dask.config.set({"distributed.scheduler.allowed-failures": False}) if hasattr(HealthCheck, 'function_scoped_fixture'): suppress = [HealthCheck.function_scoped_fixture] @@ -673,7 +674,8 @@ def test_empty_dmatrix_training_continuation(client: "Client") -> None: def run_empty_dmatrix_reg(client: "Client", parameters: dict) -> None: def _check_outputs(out: xgb.dask.TrainReturnT, predictions: np.ndarray) -> None: assert isinstance(out['booster'], xgb.dask.Booster) - assert len(out['history']['validation']['rmse']) == 2 + for _, v in out['history']['validation'].items(): + assert len(v) == 2 assert isinstance(predictions, np.ndarray) assert predictions.shape[0] == 1 @@ -866,6 +868,8 @@ def test_empty_dmatrix(tree_method) -> None: parameters = {'tree_method': tree_method} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) + parameters = {'tree_method': tree_method, "objective": "reg:absoluteerror"} + run_empty_dmatrix_reg(client, parameters) async def run_from_dask_array_asyncio(scheduler_address: str) -> xgb.dask.TrainReturnT: @@ -1284,7 +1288,12 @@ def is_stump(): def minimum_bin(): return "max_bin" in params and params["max_bin"] == 2 - if minimum_bin() and is_stump(): + # See note on `ObjFunction::UpdateTreeLeaf`. 
+    update_leaf = dataset.name.endswith("-l1")
+    if update_leaf and len(history) >= 2:
+        assert history[0] >= history[-1]
+        return
+    elif minimum_bin() and is_stump():
         assert tm.non_increasing(history, tolerance=1e-3)
     else:
         assert tm.non_increasing(history)
@@ -1304,7 +1313,7 @@ def test_hist(
                dataset=tm.dataset_strategy)
     @settings(deadline=None, suppress_health_check=suppress, print_blob=True)
     def test_approx(
-            self, client: "Client", params: Dict, dataset: tm.TestDataset
+        self, client: "Client", params: Dict, dataset: tm.TestDataset
     ) -> None:
         num_rounds = 30
         self.run_updater_test(client, params, num_rounds, dataset, 'approx')
diff --git a/tests/python/testing.py b/tests/python/testing.py
index 64417af42ab9..8633e4caa52d 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -327,6 +327,9 @@ def make_categorical(
     TestDataset(
         "calif_housing", get_california_housing, "reg:squarederror", "rmse"
     ),
+    TestDataset(
+        "calif_housing-l1", get_california_housing, "reg:absoluteerror", "mae"
+    ),
     TestDataset("digits", get_digits, "multi:softmax", "mlogloss"),
     TestDataset("cancer", get_cancer, "binary:logistic", "logloss"),
     TestDataset(
@@ -336,6 +339,7 @@ def make_categorical(
         "rmse",
     ),
     TestDataset("sparse", get_sparse, "reg:squarederror", "rmse"),
+    TestDataset("sparse-l1", get_sparse, "reg:absoluteerror", "mae"),
     TestDataset(
         "empty",
         lambda: (np.empty((0, 100)), np.empty(0)),

From 686caad40c7214a410c6b653e6babe1435d698d9 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Wed, 27 Apr 2022 23:34:22 +0800
Subject: [PATCH 14/16] [jvm-packages] remove the coalesce in barrier mode
 (#7846)

---
 .../java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java  |  2 +-
 .../xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala  | 11 +++--------
 .../xgboost4j/scala/rapids/spark/GpuTestSuite.scala   | 12 ++++--------
 3 files changed, 8 insertions(+), 17 deletions(-)

diff --git a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
index 1a8608f74845..c6109a236ddc 100644
--- a/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
+++ b/jvm-packages/xgboost4j-gpu/src/test/java/ml/dmlc/xgboost4j/gpu/java/BoosterTest.java
@@ -69,7 +69,7 @@ public void testBooster() throws XGBoostError {
       .hasHeader().build();

     int maxBin = 16;
-    int round = 100;
+    int round = 10;
     // set params
     Map<String, Object> paramMap = new HashMap<String, Object>() {
       {
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
index 5176a9cc0106..756b7b54b161 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/main/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuPreXGBoost.scala
@@ -407,14 +407,9 @@ object GpuPreXGBoost extends PreXGBoostProvider {
   }

   private def repartitionInputData(dataFrame: DataFrame, nWorkers: Int): DataFrame = {
-    // We can't check dataFrame.rdd.getNumPartitions == nWorkers here, since dataFrame.rdd is
-    // a lazy variable.
-    // If we call it here, we will not directly extract RDD[Table] again; instead, we will
-    // involve Columnar -> Row -> Columnar conversions and decrease the performance.
-    if (nWorkers == 1) {
-      dataFrame.coalesce(1)
-    } else {
-      dataFrame.repartition(nWorkers)
-    }
+    // We can't involve any coalesce operation here, since barrier mode checks
+    // the RDD pattern, which does not allow coalesce.
+    dataFrame.repartition(nWorkers)
   }

   private def repartitionForGroup(
diff --git a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
index 173ddadb8257..4d82459fa53f 100644
--- a/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
+++ b/jvm-packages/xgboost4j-spark-gpu/src/test/scala/ml/dmlc/xgboost4j/scala/rapids/spark/GpuTestSuite.scala
@@ -39,13 +39,8 @@ trait GpuTestSuite extends FunSuite with TmpFolderSuite {

   def enableCsvConf(): SparkConf = {
     new SparkConf()
-      .set(RapidsConf.ENABLE_READ_CSV_DATES.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_BYTES.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_SHORTS.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_INTEGERS.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_LONGS.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_FLOATS.key, "true")
-      .set(RapidsConf.ENABLE_READ_CSV_DOUBLES.key, "true")
+      .set("spark.rapids.sql.csv.read.float.enabled", "true")
+      .set("spark.rapids.sql.csv.read.double.enabled", "true")
   }

   def withGpuSparkSession[U](conf: SparkConf = new SparkConf())(f: SparkSession => U): U = {
@@ -246,12 +241,13 @@ object SparkSessionHolder extends Logging {
     Locale.setDefault(Locale.US)

     val builder = SparkSession.builder()
-        .master("local[1]")
+        .master("local[2]")
         .config("spark.sql.adaptive.enabled", "false")
         .config("spark.rapids.sql.enabled", "false")
         .config("spark.rapids.sql.test.enabled", "false")
         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
         .config("spark.rapids.memory.gpu.pooling.enabled", "false") // Disable RMM for unit tests.
+        .config("spark.sql.files.maxPartitionBytes", "1000")
         .appName("XGBoost4j-Spark-Gpu unit test")

     builder.getOrCreate()

From a94e1b172e6ec5dcb7f80e36aadd4e9b20a75853 Mon Sep 17 00:00:00 2001
From: Bobby Wang
Date: Thu, 28 Apr 2022 02:05:38 +0800
Subject: [PATCH 15/16] [jvm-packages] Fix model compatibility (#7845)

---
 .../params/DefaultXGBoostParamsReader.scala | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala
index bb75bb342cb1..d7d4fca771c5 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/params/DefaultXGBoostParamsReader.scala
@@ -1,5 +1,5 @@
 /*
- Copyright (c) 2014 by Contributors
+ Copyright (c) 2014-2022 by Contributors

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -16,18 +16,22 @@

 package ml.dmlc.xgboost4j.scala.spark.params

+import ml.dmlc.xgboost4j.scala.spark
+import org.apache.commons.logging.LogFactory
 import org.apache.hadoop.fs.Path
 import org.json4s.{DefaultFormats, JValue}
 import org.json4s.JsonAST.JObject
 import org.json4s.jackson.JsonMethods.{compact, parse, render}

 import org.apache.spark.SparkContext
-import org.apache.spark.ml.param.{Param, Params}
+import org.apache.spark.ml.param.Params
 import org.apache.spark.ml.util.MLReader

 // This originates from a copy-paste of apache-spark's DefaultParamsReader
 private[spark] object DefaultXGBoostParamsReader {

+  private val logger = LogFactory.getLog("XGBoostSpark")
+
   private val paramNameCompatibilityMap: Map[String, String] = Map("silent" -> "verbosity")

   private val paramValueCompatibilityMap: Map[String, Map[Any, Any]] =
@@ -126,9 +130,16 @@ private[spark] object DefaultXGBoostParamsReader {
     metadata.params match {
       case JObject(pairs) =>
         pairs.foreach { case (paramName, jsonValue) =>
-          val param = instance.getParam(handleBrokenlyChangedName(paramName))
-          val value = param.jsonDecode(compact(render(jsonValue)))
-          instance.set(param, handleBrokenlyChangedValue(paramName, value))
+          val finalName = handleBrokenlyChangedName(paramName)
+          // For deleted parameters, we'd better drop them than throw an exception,
+          // so check that the parameter still exists instead of blindly setting it.
+          if (instance.hasParam(finalName)) {
+            val param = instance.getParam(finalName)
+            val value = param.jsonDecode(compact(render(jsonValue)))
+            instance.set(param, handleBrokenlyChangedValue(paramName, value))
+          } else {
+            logger.warn(s"$finalName is no longer used in ${spark.VERSION}")
+          }
         }
       case _ =>
         throw new IllegalArgumentException(

From f7db16add1627d828c463551629b1b69602c124c Mon Sep 17 00:00:00 2001
From: Michael Allman
Date: Thu, 28 Apr 2022 00:44:30 -0700
Subject: [PATCH 16/16] Ignore all Java exceptions when looking for Linux musl
 support (#7844)

---
 .../src/main/java/ml/dmlc/xgboost4j/java/NativeLibLoader.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/NativeLibLoader.java b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/NativeLibLoader.java
index e6e6542a5288..f10bab9241a4 100644
--- a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/NativeLibLoader.java
+++ b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/NativeLibLoader.java
@@ -100,7 +100,7 @@ static boolean isMuslBased() {
       });
       return muslRelatedMemoryMappedFilename.isPresent();
-    } catch (IOException ignored) {
+    } catch (Exception ignored) {
       // ignored
     }
     return false;
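
A review note on the `-l1` test changes in this series: with `reg:absoluteerror`, `ObjFunction::UpdateTreeLeaf` re-fits leaf values after each tree is built, so the evaluation history is no longer guaranteed to drop on every round; the tests above therefore only compare the first and last entries. Below is a minimal sketch of that relaxed check, assuming a plain list of metric values; the helper name `relaxed_non_increasing` and the default tolerance are illustrative, not part of the patches.

```python
from typing import List


def relaxed_non_increasing(
    history: List[float], update_leaf: bool, tol: float = 1e-2
) -> bool:
    """Validate an evaluation history, loosening strict monotonicity when
    the objective re-fits leaf values after tree construction."""
    if update_leaf and len(history) >= 2:
        # Only require overall improvement from the first round to the last.
        return history[0] + tol >= history[-1]
    # Otherwise require the usual round-over-round non-increasing behaviour.
    return all(prev >= nxt for prev, nxt in zip(history, history[1:]))


# Mirrors the tests: the "-l1" suffix marks datasets trained with reg:absoluteerror.
assert relaxed_non_increasing([0.9, 0.95, 0.7], update_leaf=True)
assert not relaxed_non_increasing([0.9, 0.95, 0.7], update_leaf=False)
```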
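
On patch 14: Spark's barrier execution mode validates the RDD pattern and rejects `coalesce`, so the input is now always shuffled with `repartition`, which also guarantees exactly one partition per worker. A rough PySpark rendering of the Scala change, assuming an active SparkSession; the Python function name is ours:

```python
from pyspark.sql import DataFrame


def repartition_input_data(df: DataFrame, n_workers: int) -> DataFrame:
    # Barrier mode checks the RDD pattern and does not allow coalesce, so
    # always shuffle into exactly n_workers partitions, even for one worker.
    return df.repartition(n_workers)
```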
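
On patch 15: the reader now skips saved parameters that the current version no longer defines, logging a warning instead of failing the load, which keeps models written by older XGBoost4J-Spark versions loadable after parameters are removed. A loose Python analogue of the pattern, using plain attributes rather than Spark ML `Param` objects; all names here are illustrative:

```python
import logging

logger = logging.getLogger("XGBoostSpark")


def restore_params(instance: object, saved_params: dict) -> None:
    """Set saved parameters on `instance`, ignoring ones it no longer defines."""
    for name, value in saved_params.items():
        if hasattr(instance, name):
            setattr(instance, name, value)
        else:
            # Deleted parameters are dropped rather than raising, so models
            # written by older versions still load.
            logger.warning("%s is no longer used", name)
```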
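
On patch 16: `isMuslBased` decides whether to load the musl build of the native library by scanning the JVM's memory-mapped files for a musl loader, and any failure during that probe should mean "assume glibc", not a crash; hence the widened catch. A sketch of the detection idea in Python, approximating the probe with a `/proc/self/maps` scan; the function name is ours:

```python
def is_musl_based() -> bool:
    """Best-effort check for a musl-based Linux distribution."""
    try:
        with open("/proc/self/maps") as maps:
            # Mapped paths such as /lib/ld-musl-x86_64.so.1 indicate musl.
            return any("musl" in line for line in maps)
    except Exception:
        # Mirror the patch: swallow any error and fall back to assuming glibc.
        return False
```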