From 4d99c58a5f2a4e586dede954182a30f547f98169 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 18 Aug 2020 19:55:41 +0800 Subject: [PATCH 01/15] Feature weights (#5962) --- amalgamation/xgboost-all0.cc | 1 + demo/guide-python/feature_weights.py | 49 +++++++++++++ demo/json-model/json_parser.py | 2 +- doc/parameter.rst | 6 +- include/xgboost/c_api.h | 28 ++++++++ include/xgboost/data.h | 29 ++------ python-package/xgboost/core.py | 7 +- python-package/xgboost/data.py | 45 ++++++++---- python-package/xgboost/sklearn.py | 27 ++++++-- src/c_api/c_api.cc | 11 +++ src/common/common.h | 12 ++++ src/common/random.cc | 38 +++++++++++ src/common/random.h | 65 +++++++++++------- src/data/data.cc | 25 +++++++ src/data/data.cu | 24 +++++++ src/tree/updater_colmaker.cc | 6 +- src/tree/updater_gpu_hist.cu | 6 +- src/tree/updater_quantile_hist.cc | 10 +-- tests/cpp/common/test_common.cc | 13 ++++ tests/cpp/common/test_random.cc | 68 ++++++++++++++++--- tests/cpp/tree/test_gpu_hist.cu | 9 ++- .../test_device_quantile_dmatrix.py | 14 ++++ tests/python/test_demos.py | 16 +++-- tests/python/test_dmatrix.py | 41 +++++++++-- tests/python/test_with_sklearn.py | 61 +++++++++++++++++ 25 files changed, 509 insertions(+), 104 deletions(-) create mode 100644 demo/guide-python/feature_weights.py create mode 100644 src/common/random.cc create mode 100644 tests/cpp/common/test_common.cc diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc index 8220135d9e32..792b43797ce5 100644 --- a/amalgamation/xgboost-all0.cc +++ b/amalgamation/xgboost-all0.cc @@ -68,6 +68,7 @@ #include "../src/learner.cc" #include "../src/logging.cc" #include "../src/common/common.cc" +#include "../src/common/random.cc" #include "../src/common/charconv.cc" #include "../src/common/timer.cc" #include "../src/common/quantile.cc" diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py new file mode 100644 index 000000000000..b9cee8c050af --- /dev/null +++ b/demo/guide-python/feature_weights.py @@ -0,0 +1,49 @@ +'''Using feature weight to change column sampling. + + .. versionadded:: 1.3.0 +''' + +import numpy as np +import xgboost +from matplotlib import pyplot as plt +import argparse + + +def main(args): + rng = np.random.RandomState(1994) + + kRows = 1000 + kCols = 10 + + X = rng.randn(kRows, kCols) + y = rng.randn(kRows) + fw = np.ones(shape=(kCols,)) + for i in range(kCols): + fw[i] *= float(i) + + dtrain = xgboost.DMatrix(X, y) + dtrain.set_info(feature_weights=fw) + + bst = xgboost.train({'tree_method': 'hist', + 'colsample_bynode': 0.5}, + dtrain, num_boost_round=10, + evals=[(dtrain, 'd')]) + featue_map = bst.get_fscore() + # feature zero has 0 weight + assert featue_map.get('f0', None) is None + assert max(featue_map.values()) == featue_map.get('f9') + + if args.plot: + xgboost.plot_importance(bst) + plt.show() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--plot', + type=int, + default=1, + help='Set to 0 to disable plotting the evaluation history.') + args = parser.parse_args() + main(args) diff --git a/demo/json-model/json_parser.py b/demo/json-model/json_parser.py index eedcbf9c2287..c41a44d881c8 100644 --- a/demo/json-model/json_parser.py +++ b/demo/json-model/json_parser.py @@ -94,7 +94,7 @@ def __str__(self): class Model: '''Gradient boosted tree model.''' - def __init__(self, m: dict): + def __init__(self, model: dict): '''Construct the Model from JSON object. 
parameters diff --git a/doc/parameter.rst b/doc/parameter.rst index 626ddf10f8ab..7e7e774a2bfa 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -107,6 +107,10 @@ Parameters for Tree Booster 'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at each split. + On Python interface, one can set the ``feature_weights`` for DMatrix to define the + probability of each feature being selected when using column sampling. There's a + similar parameter for ``fit`` method in sklearn interface. + * ``lambda`` [default=1, alias: ``reg_lambda``] - L2 regularization term on weights. Increasing this value will make model more conservative. @@ -224,7 +228,7 @@ Parameters for Tree Booster list is a group of indices of features that are allowed to interact with each other. See tutorial for more information -Additional parameters for ``hist`` and ```gpu_hist`` tree method +Additional parameters for ``hist`` and ``gpu_hist`` tree method ================================================================ * ``single_precision_histogram``, [default=``false``] diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h index 794cbdf19e8f..4db461d11b1c 100644 --- a/include/xgboost/c_api.h +++ b/include/xgboost/c_api.h @@ -483,6 +483,34 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, bst_ulong *size, const char ***out_features); +/*! + * \brief Set meta info from dense matrix. Valid field names are: + * + * - label + * - weight + * - base_margin + * - group + * - label_lower_bound + * - label_upper_bound + * - feature_weights + * + * \param handle An instance of data matrix + * \param field Feild name + * \param data Pointer to consecutive memory storing data. + * \param size Size of the data, this is relative to size of type. (Meaning NOT number + * of bytes.) + * \param type Indicator of data type. This is defined in xgboost::DataType enum class. + * + * float = 1 + * double = 2 + * uint32_t = 3 + * uint64_t = 4 + * + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, + void *data, bst_ulong size, int type); + /*! * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix * \param handle a instance of data matrix diff --git a/include/xgboost/data.h b/include/xgboost/data.h index 1ee292a89edb..f74dbd2c5a76 100644 --- a/include/xgboost/data.h +++ b/include/xgboost/data.h @@ -88,34 +88,17 @@ class MetaInfo { * \brief Type of each feature. Automatically set when feature_type_names is specifed. */ HostDeviceVector feature_types; + /* + * \brief Weight of each feature, used to define the probability of each feature being + * selected when using column sampling. + */ + HostDeviceVector feature_weigths; /*! 
\brief default constructor */ MetaInfo() = default; MetaInfo(MetaInfo&& that) = default; MetaInfo& operator=(MetaInfo&& that) = default; - MetaInfo& operator=(MetaInfo const& that) { - this->num_row_ = that.num_row_; - this->num_col_ = that.num_col_; - this->num_nonzero_ = that.num_nonzero_; - - this->labels_.Resize(that.labels_.Size()); - this->labels_.Copy(that.labels_); - - this->group_ptr_ = that.group_ptr_; - - this->weights_.Resize(that.weights_.Size()); - this->weights_.Copy(that.weights_); - - this->base_margin_.Resize(that.base_margin_.Size()); - this->base_margin_.Copy(that.base_margin_); - - this->labels_lower_bound_.Resize(that.labels_lower_bound_.Size()); - this->labels_lower_bound_.Copy(that.labels_lower_bound_); - - this->labels_upper_bound_.Resize(that.labels_upper_bound_.Size()); - this->labels_upper_bound_.Copy(that.labels_upper_bound_); - return *this; - } + MetaInfo& operator=(MetaInfo const& that) = delete; /*! * \brief Validate all metainfo. diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 4bc77783ee91..cf22453245ac 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -455,7 +455,8 @@ def set_info(self, label_lower_bound=None, label_upper_bound=None, feature_names=None, - feature_types=None): + feature_types=None, + feature_weights=None): '''Set meta info for DMatrix.''' if label is not None: self.set_label(label) @@ -473,6 +474,10 @@ def set_info(self, self.feature_names = feature_names if feature_types is not None: self.feature_types = feature_types + if feature_weights is not None: + from .data import dispatch_meta_backend + dispatch_meta_backend(matrix=self, data=feature_weights, + name='feature_weights') def get_float_info(self, field): """Get float property from the DMatrix. diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 9491efd1c38c..e4c05dcc244e 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads, raise TypeError('Not supported type for data.' 
+ str(type(data))) +def _to_data_type(dtype: str, name: str): + dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4} + if dtype not in dtype_map.keys(): + raise TypeError( + f'Expecting float32, float64, uint32, uint64, got {dtype} ' + + f'for {name}.') + return dtype_map[dtype] + + +def _validate_meta_shape(data): + if hasattr(data, 'shape'): + assert len(data.shape) == 1 or ( + len(data.shape) == 2 and + (data.shape[1] == 0 or data.shape[1] == 1)) + + def _meta_from_numpy(data, field, dtype, handle): data = _maybe_np_slice(data, dtype) - if dtype == 'uint32': - c_data = c_array(ctypes.c_uint32, data) - _check_call(_LIB.XGDMatrixSetUIntInfo(handle, - c_str(field), - c_array(ctypes.c_uint, data), - c_bst_ulong(len(data)))) - elif dtype == 'float': - c_data = c_array(ctypes.c_float, data) - _check_call(_LIB.XGDMatrixSetFloatInfo(handle, - c_str(field), - c_data, - c_bst_ulong(len(data)))) - else: - raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field) + interface = data.__array_interface__ + assert interface.get('mask', None) is None, 'Masked array is not supported' + size = data.shape[0] + + c_type = _to_data_type(str(data.dtype), field) + ptr = interface['data'][0] + ptr = ctypes.c_void_p(ptr) + _check_call(_LIB.XGDMatrixSetDenseInfo( + handle, + c_str(field), + ptr, + c_bst_ulong(size), + c_type + )) def _meta_from_list(data, field, dtype, handle): @@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle): def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): '''Dispatch for meta info.''' handle = matrix.handle + _validate_meta_shape(data) if data is None: return if _is_list(data): diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index f533f7f3477d..c6c34dce1c99 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -441,6 +441,7 @@ def load_model(self, fname): def fit(self, X, y, sample_weight=None, base_margin=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, sample_weight_eval_set=None, + feature_weights=None, callbacks=None): # pylint: disable=invalid-name,attribute-defined-outside-init """Fit gradient boosting model @@ -459,9 +460,6 @@ def fit(self, X, y, sample_weight=None, base_margin=None, A list of (X, y) tuple pairs to use as validation sets, for which metrics will be computed. Validation metrics will help us track the performance of the model. - sample_weight_eval_set : list, optional - A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of - instance weights on the i-th validation set. eval_metric : str, list of str, or callable, optional If a str, should be a built-in evaluation metric to use. See doc/parameter.rst. @@ -490,6 +488,13 @@ def fit(self, X, y, sample_weight=None, base_margin=None, xgb_model : str file name of stored XGBoost model or 'Booster' instance XGBoost model to be loaded before training (allows training continuation). + sample_weight_eval_set : list, optional + A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of + instance weights on the i-th validation set. + feature_weights: array_like + Weight for each feature, defines the probability of each feature + being selected when colsample is being used. All values must be + greater than 0, otherwise a `ValueError` is thrown. callbacks : list of callback functions List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`callback_api`. 
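The ``feature_weights`` argument documented above is forwarded to the training DMatrix by ``fit``. A minimal usage sketch, assuming the hist tree method and illustrative data shapes (the array values below are not part of the patch):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(0)
X = rng.randn(256, 8)
y = rng.randn(256)

# Give later columns a higher probability of being drawn by colsample_bynode.
fw = np.arange(1, X.shape[1] + 1, dtype=np.float32)

# Via the sklearn wrapper: the weights are attached to the training DMatrix.
reg = xgb.XGBRegressor(tree_method='hist', colsample_bynode=0.5)
reg.fit(X, y, feature_weights=fw)

# Equivalent low-level form using DMatrix.set_info directly.
dtrain = xgb.DMatrix(X, y)
dtrain.set_info(feature_weights=fw)
booster = xgb.train({'tree_method': 'hist', 'colsample_bynode': 0.5},
                    dtrain, num_boost_round=10)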
@@ -498,6 +503,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None, .. code-block:: python [xgb.callback.reset_learning_rate(custom_rates)] + """ self.n_features_in_ = X.shape[1] @@ -505,6 +511,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs) + train_dmatrix.set_info(feature_weights=feature_weights) evals_result = {} @@ -759,7 +766,7 @@ def __init__(self, objective="binary:logistic", **kwargs): def fit(self, X, y, sample_weight=None, base_margin=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True, xgb_model=None, - sample_weight_eval_set=None, callbacks=None): + sample_weight_eval_set=None, feature_weights=None, callbacks=None): # pylint: disable = attribute-defined-outside-init,arguments-differ evals_result = {} @@ -821,6 +828,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None, train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs) + train_dmatrix.set_info(feature_weights=feature_weights) self._Booster = train(xgb_options, train_dmatrix, self.get_num_boosting_rounds(), @@ -1101,10 +1109,10 @@ def __init__(self, objective='rank:pairwise', **kwargs): raise ValueError("please use XGBRanker for ranking task") def fit(self, X, y, group, sample_weight=None, base_margin=None, - eval_set=None, - sample_weight_eval_set=None, eval_group=None, eval_metric=None, + eval_set=None, sample_weight_eval_set=None, + eval_group=None, eval_metric=None, early_stopping_rounds=None, verbose=False, xgb_model=None, - callbacks=None): + feature_weights=None, callbacks=None): # pylint: disable = attribute-defined-outside-init,arguments-differ """Fit gradient boosting ranker @@ -1170,6 +1178,10 @@ def fit(self, X, y, group, sample_weight=None, base_margin=None, xgb_model : str file name of stored XGBoost model or 'Booster' instance XGBoost model to be loaded before training (allows training continuation). + feature_weights: array_like + Weight for each feature, defines the probability of each feature + being selected when colsample is being used. All values must be + greater than 0, otherwise a `ValueError` is thrown. callbacks : list of callback functions List of callback functions that are applied at end of each iteration. 
It is possible to use predefined callbacks by using @@ -1205,6 +1217,7 @@ def _dmat_init(group, **params): train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight, base_margin=base_margin, missing=self.missing, nthread=self.n_jobs) + train_dmatrix.set_info(feature_weights=feature_weights) train_dmatrix.set_group(group) evals_result = {} diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index aa6ecf43a784..397f83e69bf8 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -316,6 +316,17 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field, API_END(); } +XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field, + void *data, xgboost::bst_ulong size, + int type) { + API_BEGIN(); + CHECK_HANDLE(); + auto &info = static_cast *>(handle)->get()->Info(); + CHECK(type >= 1 && type <= 4); + info.SetInfo(field, data, static_cast(type), size); + API_END(); +} + XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle, const unsigned* group, xgboost::bst_ulong len) { diff --git a/src/common/common.h b/src/common/common.h index b0bd6b6d6cec..a4397d1c89aa 100644 --- a/src/common/common.h +++ b/src/common/common.h @@ -9,12 +9,15 @@ #include #include +#include #include +#include #include #include #include #include #include +#include #if defined(__CUDACC__) #include @@ -160,6 +163,15 @@ inline void AssertOneAPISupport() { #endif // XGBOOST_USE_ONEAPI } +template > +std::vector ArgSort(std::vector const &array, Comp comp = std::less{}) { + std::vector result(array.size()); + std::iota(result.begin(), result.end(), 0); + std::stable_sort( + result.begin(), result.end(), + [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); }); + return result; +} } // namespace common } // namespace xgboost #endif // XGBOOST_COMMON_COMMON_H_ diff --git a/src/common/random.cc b/src/common/random.cc new file mode 100644 index 000000000000..f386cad916b2 --- /dev/null +++ b/src/common/random.cc @@ -0,0 +1,38 @@ +/*! + * Copyright 2020 by XGBoost Contributors + * \file random.cc + */ +#include "random.h" + +namespace xgboost { +namespace common { +std::shared_ptr> ColumnSampler::ColSample( + std::shared_ptr> p_features, + float colsample) { + if (colsample == 1.0f) { + return p_features; + } + const auto &features = p_features->HostVector(); + CHECK_GT(features.size(), 0); + + int n = std::max(1, static_cast(colsample * features.size())); + auto p_new_features = std::make_shared>(); + auto &new_features = *p_new_features; + + if (feature_weights_.size() != 0) { + new_features.HostVector() = WeightedSamplingWithoutReplacement( + p_features->HostVector(), feature_weights_, n); + } else { + new_features.Resize(features.size()); + std::copy(features.begin(), features.end(), + new_features.HostVector().begin()); + std::shuffle(new_features.HostVector().begin(), + new_features.HostVector().end(), rng_); + new_features.Resize(n); + } + std::sort(new_features.HostVector().begin(), new_features.HostVector().end()); + return p_new_features; +} + +} // namespace common +} // namespace xgboost diff --git a/src/common/random.h b/src/common/random.h index 45af80ce030b..7fd461d22d0f 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2020 by Contributors * \file random.h * \brief Utility related to random. 
* \author Tianqi Chen @@ -10,14 +10,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "xgboost/host_device_vector.h" +#include "common.h" namespace xgboost { namespace common { @@ -75,6 +78,38 @@ using GlobalRandomEngine = RandomEngine; */ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) +/* + * Original paper: + * Weighted Random Sampling (2005; Efraimidis, Spirakis) + * + * Blog: + * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/ +*/ +template +std::vector WeightedSamplingWithoutReplacement( + std::vector const &array, std::vector const &weights, size_t n) { + // ES sampling. + CHECK_EQ(array.size(), weights.size()); + std::vector keys(weights.size()); + std::uniform_real_distribution dist; + auto& rng = GlobalRandom(); + for (size_t i = 0; i < array.size(); ++i) { + auto w = std::max(weights.at(i), kRtEps); + auto u = dist(rng); + auto k = std::log(u) / w; + keys[i] = k; + } + auto ind = ArgSort(keys, std::greater<>{}); + ind.resize(n); + + std::vector results(ind.size()); + for (size_t k = 0; k < ind.size(); ++k) { + auto idx = ind[k]; + results[k] = array[idx]; + } + return results; +} + /** * \class ColumnSampler * @@ -82,36 +117,18 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) * colsample_bynode parameters. Should be initialised before tree construction and to * reset when tree construction is completed. */ - class ColumnSampler { std::shared_ptr> feature_set_tree_; std::map>> feature_set_level_; + std::vector feature_weights_; float colsample_bylevel_{1.0f}; float colsample_bytree_{1.0f}; float colsample_bynode_{1.0f}; GlobalRandomEngine rng_; - std::shared_ptr> ColSample( - std::shared_ptr> p_features, float colsample) { - if (colsample == 1.0f) return p_features; - const auto& features = p_features->HostVector(); - CHECK_GT(features.size(), 0); - int n = std::max(1, static_cast(colsample * features.size())); - auto p_new_features = std::make_shared>(); - auto& new_features = *p_new_features; - new_features.Resize(features.size()); - std::copy(features.begin(), features.end(), - new_features.HostVector().begin()); - std::shuffle(new_features.HostVector().begin(), - new_features.HostVector().end(), rng_); - new_features.Resize(n); - std::sort(new_features.HostVector().begin(), - new_features.HostVector().end()); - - return p_new_features; - } - public: + std::shared_ptr> ColSample( + std::shared_ptr> p_features, float colsample); /** * \brief Column sampler constructor. * \note This constructor manually sets the rng seed @@ -139,8 +156,10 @@ class ColumnSampler { * \param colsample_bytree * \param skip_index_0 (Optional) True to skip index 0. 
*/ - void Init(int64_t num_col, float colsample_bynode, float colsample_bylevel, + void Init(int64_t num_col, std::vector feature_weights, + float colsample_bynode, float colsample_bylevel, float colsample_bytree, bool skip_index_0 = false) { + feature_weights_ = std::move(feature_weights); colsample_bylevel_ = colsample_bylevel; colsample_bytree_ = colsample_bytree; colsample_bynode_ = colsample_bynode; diff --git a/src/data/data.cc b/src/data/data.cc index 401a35081830..677812ebba7a 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -293,6 +293,9 @@ MetaInfo MetaInfo::Slice(common::Span ridxs) const { } else { out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs); } + + out.feature_weigths.Resize(this->feature_weigths.Size()); + out.feature_weigths.Copy(this->feature_weigths); return out; } @@ -377,6 +380,16 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t labels.resize(num); DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, std::copy(cast_dptr, cast_dptr + num, labels.begin())); + } else if (!std::strcmp(key, "feature_weights")) { + auto &h_feature_weights = feature_weigths.HostVector(); + h_feature_weights.resize(num); + DISPATCH_CONST_PTR( + dtype, dptr, cast_dptr, + std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin())); + bool valid = + std::all_of(h_feature_weights.cbegin(), h_feature_weights.cend(), + [](float w) { return w >= 0; }); + CHECK(valid) << "Feature weight must be greater than 0."; } else { LOG(FATAL) << "Unknown key for MetaInfo: " << key; } @@ -396,6 +409,8 @@ void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype, vec = &this->labels_lower_bound_.HostVector(); } else if (!std::strcmp(key, "label_upper_bound")) { vec = &this->labels_upper_bound_.HostVector(); + } else if (!std::strcmp(key, "feature_weights")) { + vec = &this->feature_weigths.HostVector(); } else { LOG(FATAL) << "Unknown float field name: " << key; } @@ -497,6 +512,11 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) { auto &h_feature_types = feature_types.HostVector(); LoadFeatureType(this->feature_type_names, &h_feature_types); } + if (!that.feature_weigths.Empty()) { + this->feature_weigths.Resize(that.feature_weigths.Size()); + this->feature_weigths.SetDevice(that.feature_weigths.DeviceIdx()); + this->feature_weigths.Copy(that.feature_weigths); + } } void MetaInfo::Validate(int32_t device) const { @@ -538,6 +558,11 @@ void MetaInfo::Validate(int32_t device) const { check_device(labels_lower_bound_); return; } + if (feature_weigths.Size() != 0) { + CHECK_EQ(feature_weigths.Size(), num_col_) + << "Size of feature_weights must equal to number of columns."; + check_device(feature_weigths); + } if (labels_upper_bound_.Size() != 0) { CHECK_EQ(labels_upper_bound_.Size(), num_row_) << "Size of label_upper_bound must equal to number of rows."; diff --git a/src/data/data.cu b/src/data/data.cu index 5e63a828c207..15260498734d 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -58,6 +58,15 @@ void CopyGroupInfoImpl(ArrayInterface column, std::vector* out) { std::partial_sum(out->begin(), out->end(), out->begin()); } +namespace { +// thrust::all_of tries to copy lambda function. 
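The MetaInfo handling above requires every feature weight to be non-negative and, once training begins, the weight vector length to equal the number of columns. A short Python-level sketch of how that validation surfaces, with illustrative values:

import numpy as np
import xgboost as xgb

X = np.random.randn(16, 4)
m = xgb.DMatrix(X)

m.set_info(feature_weights=np.array([0.0, 1.0, 2.0, 3.0]))  # accepted: all weights >= 0
print(m.get_float_info('feature_weights'))

try:
    m.set_info(feature_weights=np.array([-1.0, 1.0, 2.0, 3.0]))
except ValueError as err:
    print('rejected:', err)  # negative weights fail the check in MetaInfo::SetInfo

# A length mismatch against the number of columns is reported later, when
# MetaInfo::Validate runs at the start of training.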
+struct AllOfOp { + __device__ bool operator()(float w) { + return w >= 0; + } +}; +} // anonymous namespace + void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); auto const& j_arr = get(j_interface); @@ -82,6 +91,21 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { } else if (key == "group") { CopyGroupInfoImpl(array_interface, &group_ptr_); return; + } else if (key == "label_lower_bound") { + CopyInfoImpl(array_interface, &labels_lower_bound_); + return; + } else if (key == "label_upper_bound") { + CopyInfoImpl(array_interface, &labels_upper_bound_); + return; + } else if (key == "feature_weights") { + CopyInfoImpl(array_interface, &feature_weigths); + auto d_feature_weights = feature_weigths.ConstDeviceSpan(); + auto valid = + thrust::all_of(thrust::device, d_feature_weights.data(), + d_feature_weights.data() + d_feature_weights.size(), + AllOfOp{}); + CHECK(valid) << "Feature weight must be greater than 0."; + return; } else { LOG(FATAL) << "Unknown metainfo: " << key; } diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 951cfdb5ec27..45cdb0ba9163 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -235,8 +235,10 @@ class ColMaker: public TreeUpdater { } } { - column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); + column_sampler_.Init(fmat.Info().num_col_, + fmat.Info().feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree); } { // setup temp space for each thread diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5cbe75350402..3535a59d6f85 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -266,8 +266,10 @@ struct GPUHistMakerDevice { // Note that the column sampler must be passed by value because it is not // thread safe void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { - this->column_sampler.Init(num_columns, param.colsample_bynode, - param.colsample_bylevel, param.colsample_bytree); + auto const& info = dmat->Info(); + this->column_sampler.Init(num_columns, info.feature_weigths.HostVector(), + param.colsample_bynode, param.colsample_bylevel, + param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id)); this->interaction_constraints.Reset(); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 37a90dfebd74..95d3c2008ef9 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -841,11 +841,13 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& // store a pointer to the tree p_last_tree_ = &tree; if (data_layout_ == kDenseDataOneBased) { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, true); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, true); } else { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, false); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, false); } if (data_layout_ 
== kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { /* specialized code for dense data: diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc new file mode 100644 index 000000000000..006860b11af2 --- /dev/null +++ b/tests/cpp/common/test_common.cc @@ -0,0 +1,13 @@ +#include +#include "../../../src/common/common.h" + +namespace xgboost { +namespace common { +TEST(ArgSort, Basic) { + std::vector inputs {3.0, 2.0, 1.0}; + auto ret = ArgSort(inputs); + std::vector sol{2, 1, 0}; + ASSERT_EQ(ret, sol); +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index dc7b38554162..9b2a1515543f 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -8,9 +8,10 @@ namespace common { TEST(ColumnSampler, Test) { int n = 128; ColumnSampler cs; + std::vector feature_weights; // No node sampling - cs.Init(n, 1.0f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f); auto set0 = cs.GetFeatureSet(0); ASSERT_EQ(set0->Size(), 32); @@ -23,7 +24,7 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set2->Size(), 32); // Node sampling - cs.Init(n, 0.5f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f); auto set3 = cs.GetFeatureSet(0); ASSERT_EQ(set3->Size(), 32); @@ -33,19 +34,19 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set4->Size(), 32); // No level or node sampling, should be the same at different depth - cs.Init(n, 1.0f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f); ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set5 = cs.GetFeatureSet(0); ASSERT_EQ(set5->Size(), n); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set6 = cs.GetFeatureSet(0); ASSERT_EQ(set5->HostVector(), set6->HostVector()); // Should always be a minimum of one feature - cs.Init(n, 1e-16f, 1e-16f, 1e-16f); + cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); } @@ -56,13 +57,13 @@ TEST(ColumnSampler, ThreadSynchronisation) { size_t iterations = 10; size_t levels = 5; std::vector reference_result; - bool success = - true; // Cannot use google test asserts in multithreaded region + std::vector feature_weights; + bool success = true; // Cannot use google test asserts in multithreaded region #pragma omp parallel num_threads(num_threads) { for (auto j = 0ull; j < iterations; j++) { ColumnSampler cs(j); - cs.Init(n, 0.5f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f); for (auto level = 0ull; level < levels; level++) { auto result = cs.GetFeatureSet(level)->ConstHostVector(); #pragma omp single @@ -76,5 +77,54 @@ TEST(ColumnSampler, ThreadSynchronisation) { } ASSERT_TRUE(success); } + +TEST(ColumnSampler, WeightedSampling) { + auto test_basic = [](int first) { + std::vector feature_weights(2); + feature_weights[0] = std::abs(first - 1.0f); + feature_weights[1] = first - 0.0f; + ColumnSampler cs{0}; + cs.Init(2, feature_weights, 1.0, 1.0, 0.5); + auto feature_sets = cs.GetFeatureSet(0); + auto const &h_feat_set = feature_sets->HostVector(); + ASSERT_EQ(h_feat_set.size(), 1); + ASSERT_EQ(h_feat_set[0], first - 0); + }; + + test_basic(0); + test_basic(1); + + size_t constexpr kCols = 64; + std::vector feature_weights(kCols); + SimpleLCG rng; + SimpleRealUniformDistribution dist(.0f, 12.0f); + std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return 
dist(&rng); }); + ColumnSampler cs{0}; + cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f); + std::vector features(kCols); + std::iota(features.begin(), features.end(), 0); + std::vector freq(kCols, 0); + for (size_t i = 0; i < 1024; ++i) { + auto fset = cs.GetFeatureSet(0); + ASSERT_EQ(kCols * 0.5, fset->Size()); + auto const& h_fset = fset->HostVector(); + for (auto f : h_fset) { + freq[f] += 1.0f; + } + } + + auto norm = std::accumulate(freq.cbegin(), freq.cend(), .0f); + for (auto& f : freq) { + f /= norm; + } + norm = std::accumulate(feature_weights.cbegin(), feature_weights.cend(), .0f); + for (auto& f : feature_weights) { + f /= norm; + } + + for (size_t i = 0; i < feature_weights.size(); ++i) { + EXPECT_NEAR(freq[i], feature_weights[i], 1e-2); + } +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index fd5c9f43fb2a..153cafb88fd8 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -204,12 +204,11 @@ TEST(GpuHist, EvaluateRootSplit) { ASSERT_EQ(maker.hist.Data().size(), hist.size()); thrust::copy(hist.begin(), hist.end(), maker.hist.Data().begin()); + std::vector feature_weights; - maker.column_sampler.Init(kNCols, - param.colsample_bynode, - param.colsample_bylevel, - param.colsample_bytree, - false); + maker.column_sampler.Init(kNCols, feature_weights, param.colsample_bynode, + param.colsample_bylevel, param.colsample_bytree, + false); RegTree tree; MetaInfo info; diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py index f0978a0afaf4..c44de28bd2ff 100644 --- a/tests/python-gpu/test_device_quantile_dmatrix.py +++ b/tests/python-gpu/test_device_quantile_dmatrix.py @@ -16,6 +16,20 @@ def test_dmatrix_numpy_init(self): match='is not supported for DeviceQuantileDMatrix'): xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64)) + @pytest.mark.skipif(**tm.no_cupy()) + def test_dmatrix_feature_weights(self): + import cupy as cp + rng = cp.random.RandomState(1994) + data = rng.randn(5, 5) + m = xgb.DMatrix(data) + + feature_weights = rng.uniform(size=5) + m.set_info(feature_weights=feature_weights) + + cp.testing.assert_array_equal( + cp.array(m.get_float_info('feature_weights')), + feature_weights.astype(np.float32)) + @pytest.mark.skipif(**tm.no_cupy()) def test_dmatrix_cupy_init(self): import cupy as cp diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 8b6535dbff45..25c1c4de6c1f 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -1,12 +1,10 @@ import os import subprocess -import sys import pytest import testing as tm -CURRENT_DIR = os.path.dirname(__file__) -ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR)) +ROOT_DIR = tm.PROJECT_ROOT DEMO_DIR = os.path.join(ROOT_DIR, 'demo') PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python') @@ -19,21 +17,27 @@ def test_basic_walkthrough(): os.remove('dump.raw.txt') +@pytest.mark.skipif(**tm.no_matplotlib()) def test_custom_multiclass_objective(): script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py') cmd = ['python', script, '--plot=0'] subprocess.check_call(cmd) +@pytest.mark.skipif(**tm.no_matplotlib()) def test_custom_rmsle_objective(): - major, minor = sys.version_info[:2] - if minor < 6: - pytest.skip('Skipping RMLSE test due to Python version being too low.') script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py') cmd = ['python', script, '--plot=0'] subprocess.check_call(cmd) 
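The WeightedSampling test above checks that empirical selection frequencies track the normalised feature weights. A NumPy sketch of the same exponential-keys scheme used by WeightedSamplingWithoutReplacement; the helper name and the epsilon floor are illustrative:

import numpy as np

def weighted_sample_without_replacement(items, weights, n, rng):
    # Efraimidis-Spirakis: draw u ~ U(0, 1) per item, rank items by log(u) / w
    # and keep the n largest keys, so heavier weights are picked more often.
    w = np.maximum(np.asarray(weights, dtype=float), 1e-6)
    keys = np.log(rng.uniform(size=len(w))) / w
    return [items[i] for i in np.argsort(-keys)[:n]]

rng = np.random.RandomState(0)
counts = np.zeros(3)
for _ in range(10000):
    for f in weighted_sample_without_replacement([0, 1, 2], [1.0, 2.0, 3.0], 2, rng):
        counts[f] += 1
print(counts / counts.sum())  # selection frequency increases with the weight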
+@pytest.mark.skipif(**tm.no_matplotlib()) +def test_feature_weights_demo(): + script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py') + cmd = ['python', script, '--plot=0'] + subprocess.check_call(cmd) + + @pytest.mark.skipif(**tm.no_sklearn()) def test_sklearn_demo(): script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py') diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index ecf5f60411bf..f641ea2c54f4 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -99,6 +99,11 @@ def test_slice(self): X = rng.randn(100, 100) y = rng.randint(low=0, high=3, size=100) d = xgb.DMatrix(X, y) + np.testing.assert_equal(d.get_label(), y.astype(np.float32)) + + fw = rng.uniform(size=100).astype(np.float32) + d.set_info(feature_weights=fw) + eval_res_0 = {} booster = xgb.train( {'num_class': 3, 'objective': 'multi:softprob'}, d, @@ -106,19 +111,23 @@ def test_slice(self): predt = booster.predict(d) predt = predt.reshape(100 * 3, 1) + d.set_base_margin(predt) ridxs = [1, 2, 3, 4, 5, 6] - d = d.slice(ridxs) - sliced_margin = d.get_float_info('base_margin') + sliced = d.slice(ridxs) + + sliced_margin = sliced.get_float_info('base_margin') assert sliced_margin.shape[0] == len(ridxs) * 3 eval_res_1 = {} - xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d, - num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1) + xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced, + num_boost_round=2, evals=[(sliced, 'd')], + evals_result=eval_res_1) eval_res_0 = eval_res_0['d']['merror'] eval_res_1 = eval_res_1['d']['merror'] + for i in range(len(eval_res_0)): assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02 @@ -196,13 +205,33 @@ def test_get_info(self): dtrain.get_float_info('base_margin') dtrain.get_uint_info('group_ptr') + def test_feature_weights(self): + kRows = 10 + kCols = 50 + rng = np.random.RandomState(1994) + fw = rng.uniform(size=kCols) + X = rng.randn(kRows, kCols) + m = xgb.DMatrix(X) + m.set_info(feature_weights=fw) + np.testing.assert_allclose(fw, m.get_float_info('feature_weights')) + # Handle empty + m.set_info(feature_weights=np.empty((0, 0))) + + assert m.get_float_info('feature_weights').shape[0] == 0 + + fw -= 1 + + def assign_weight(): + m.set_info(feature_weights=fw) + self.assertRaises(ValueError, assign_weight) + def test_sparse_dmatrix_csr(self): nrow = 100 ncol = 1000 x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) assert x.indices.max() < ncol - 1 x.data[:] = 1 - dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow)) + dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) watchlist = [(dtrain, 'train')] param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0} @@ -215,7 +244,7 @@ def test_sparse_dmatrix_csc(self): x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng) assert x.indices.max() < nrow - 1 x.data[:] = 1 - dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow)) + dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol) watchlist = [(dtrain, 'train')] param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0} diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 7f62a3e83052..ce0b57e823ff 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1,3 +1,5 @@ +import collections +import importlib.util import numpy as np import 
xgboost as xgb from xgboost.sklearn import XGBoostLabelEncoder @@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel(): eval_set=[(X_train, y_train), (X_test, y_test)], sample_weight_eval_set=[weights_train]) + def test_validation_weights_xgbclassifier(): from sklearn.datasets import make_hastie_10_2 @@ -920,6 +923,64 @@ def test_pandas_input(): np.array([0, 1])) +def run_feature_weights(increasing): + with TemporaryDirectory() as tmpdir: + kRows = 512 + kCols = 64 + colsample_bynode = 0.5 + reg = xgb.XGBRegressor(tree_method='hist', + colsample_bynode=colsample_bynode) + X = rng.randn(kRows, kCols) + y = rng.randn(kRows) + fw = np.ones(shape=(kCols,)) + for i in range(kCols): + if increasing: + fw[i] *= float(i) + else: + fw[i] *= float(kCols - i) + + reg.fit(X, y, feature_weights=fw) + model_path = os.path.join(tmpdir, 'model.json') + reg.save_model(model_path) + with open(model_path) as fd: + model = json.load(fd) + + parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model', + 'json_parser.py') + spec = importlib.util.spec_from_file_location("JsonParser", + parser_path) + foo = importlib.util.module_from_spec(spec) + spec.loader.exec_module(foo) + model = foo.Model(model) + splits = {} + total_nodes = 0 + for tree in model.trees: + n_nodes = len(tree.nodes) + total_nodes += n_nodes + for n in range(n_nodes): + if tree.is_leaf(n): + continue + if splits.get(tree.split_index(n), None) is None: + splits[tree.split_index(n)] = 1 + else: + splits[tree.split_index(n)] += 1 + + od = collections.OrderedDict(sorted(splits.items())) + tuples = [(k, v) for k, v in od.items()] + k, v = list(zip(*tuples)) + w = np.polyfit(k, v, deg=1) + return w + + +def test_feature_weights(): + poly_increasing = run_feature_weights(True) + poly_decreasing = run_feature_weights(False) + # Approxmated test, this is dependent on the implementation of random + # number generator in std library. + assert poly_increasing[0] > 0.08 + assert poly_decreasing[0] < -0.08 + + class TestBoostFromPrediction(unittest.TestCase): def run_boost_from_prediction(self, tree_method): from sklearn.datasets import load_breast_cancer From 989ddd036f1c9829d29abc05e9b2478f74d793ef Mon Sep 17 00:00:00 2001 From: Qi Zhang Date: Tue, 18 Aug 2020 17:47:17 -0400 Subject: [PATCH 02/15] Swap byte-order in binary serializer to support big-endian arch (#5813) * fixed some endian issues * Use dmlc::ByteSwap() to simplify code * Fix lint check * [CI] Add test for s390x * Download latest CMake on s390x * Fix a bug in my code * Save magic number in dmatrix with byteswap on big-endian machine * Save version in binary with byteswap on big-endian machine * Load scalar with byteswap in MetaInfo * Add a debugging message * Handle arrays correctly when byteswapping * EOF can also be 255 * Handle magic number in MetaInfo carefully * Skip Tree.Load test for big-endian, since the test manually builds little-endian binary model * Handle missing packages in Python tests * Don't use boto3 in model compatibility tests * Add s390 Docker file for local testing * Add model compatibility tests * Add R compatibility test * Revert "Add R compatibility test" This reverts commit c2d2bdcb7dbae133cbb927fcd20f7e83ee2b18a8. 
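A small standalone illustration of the byte-order problem this commit addresses: the same fixed-width field serialised on a little-endian and a big-endian machine differs only by a byte reversal, so binary models must be byte-swapped when crossing architectures. The value used here is arbitrary, not XGBoost's actual magic number:

import struct

value = 0x12345678                  # arbitrary example value
little = struct.pack('<I', value)   # layout written by a little-endian (x86) host
big = struct.pack('>I', value)      # layout written by a big-endian (s390x) host
assert little == big[::-1]          # the two layouts are byte-reversed copies

# Reading the little-endian bytes as-is on a big-endian host yields the wrong
# number, which is why each fixed-width field is swapped (dmlc::ByteSwap in the
# changes below) when loading or saving on a machine whose endianness differs
# from the serialised format.
misread = struct.unpack('>I', little)[0]
assert misread != value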
Co-authored-by: Qi Zhang Co-authored-by: Hyunsu Cho --- .travis.yml | 40 +++++++++++------------- include/xgboost/tree_model.h | 35 +++++++++++++++++++++ python-package/xgboost/core.py | 8 +++-- src/common/version.cc | 12 +++---- src/data/data.cc | 20 +++++++----- src/data/simple_dmatrix.cc | 5 ++- src/data/sparse_page_source.h | 4 +-- src/gbm/gbtree_model.cc | 34 +++++++++++++++++--- src/gbm/gbtree_model.h | 15 +++++++++ src/learner.cc | 35 +++++++++++++++++++-- src/tree/tree_model.cc | 40 ++++++++++++++++++++++-- tests/ci_build/Dockerfile.s390x | 27 ++++++++++++++++ tests/cpp/common/test_json.cc | 3 +- tests/cpp/tree/test_tree_model.cc | 2 ++ tests/python/test_demos.py | 2 ++ tests/python/test_early_stopping.py | 1 + tests/python/test_model_compatibility.py | 15 +++------ tests/python/testing.py | 10 ++++-- tests/travis/run_test.sh | 16 ++++++++++ tests/travis/setup.sh | 9 ++++++ 20 files changed, 266 insertions(+), 67 deletions(-) create mode 100644 tests/ci_build/Dockerfile.s390x diff --git a/.travis.yml b/.travis.yml index d0f72423b6ea..5f782ffe472a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,38 +1,33 @@ -# disable sudo for container build. sudo: required -# Enabling test OS X -os: - - linux - - osx - osx_image: xcode10.1 dist: bionic -# Use Build Matrix to do lint and build seperately env: - matrix: - # python package test - - TASK=python_test - # test installation of Python source distribution - - TASK=python_sdist_test - # java package test - - TASK=java_test - # cmake test - - TASK=cmake_test - global: - secure: "PR16i9F8QtNwn99C5NDp8nptAS+97xwDtXEJJfEiEVhxPaaRkOp0MPWhogCaK0Eclxk1TqkgWbdXFknwGycX620AzZWa/A1K3gAs+GrpzqhnPMuoBJ0Z9qxXTbSJvCyvMbYwVrjaxc/zWqdMU8waWz8A7iqKGKs/SqbQ3rO6v7c=" - secure: "dAGAjBokqm/0nVoLMofQni/fWIBcYSmdq4XvCBX1ZAMDsWnuOfz/4XCY6h2lEI1rVHZQ+UdZkc9PioOHGPZh5BnvE49/xVVWr9c4/61lrDOlkD01ZjSAeoV0fAZq+93V/wPl4QV+MM+Sem9hNNzFSbN5VsQLAiWCSapWsLdKzqA=" -matrix: - exclude: +jobs: + include: - os: linux + arch: amd64 + env: TASK=python_sdist_test + - os: osx + arch: amd64 env: TASK=python_test - - os: linux + - os: osx + arch: amd64 + env: TASK=python_sdist_test + - os: osx + arch: amd64 env: TASK=java_test - - os: linux + - os: osx + arch: amd64 env: TASK=cmake_test + - os: linux + arch: s390x + env: TASK=s390x_test # dependent brew packages addons: @@ -47,6 +42,9 @@ addons: - wget - r update: true + apt: + packages: + - snapd before_install: - source tests/travis/travis_setup_env.sh diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index e7f6dc8ec089..fd9c69df3e7b 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -59,6 +59,21 @@ struct TreeParam : public dmlc::Parameter { num_nodes = 1; deprecated_num_roots = 1; } + + // Swap byte order for all fields. 
Useful for transporting models between machines with different + // endianness (big endian vs little endian) + inline TreeParam ByteSwap() const { + TreeParam x = *this; + dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); + dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1); + dmlc::ByteSwap(&x.num_deleted, sizeof(x.num_deleted), 1); + dmlc::ByteSwap(&x.deprecated_max_depth, sizeof(x.deprecated_max_depth), 1); + dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1); + dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } + // declare the parameters DMLC_DECLARE_PARAMETER(TreeParam) { // only declare the parameters that can be set by the user. @@ -97,6 +112,16 @@ struct RTreeNodeStat { return loss_chg == b.loss_chg && sum_hess == b.sum_hess && base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt; } + // Swap byte order for all fields. Useful for transporting models between machines with different + // endianness (big endian vs little endian) + inline RTreeNodeStat ByteSwap() const { + RTreeNodeStat x = *this; + dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1); + dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1); + dmlc::ByteSwap(&x.base_weight, sizeof(x.base_weight), 1); + dmlc::ByteSwap(&x.leaf_child_cnt, sizeof(x.leaf_child_cnt), 1); + return x; + } }; /*! @@ -227,6 +252,16 @@ class RegTree : public Model { info_.leaf_value == b.info_.leaf_value; } + inline Node ByteSwap() const { + Node x = *this; + dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1); + dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1); + dmlc::ByteSwap(&x.cright_, sizeof(x.cright_), 1); + dmlc::ByteSwap(&x.sindex_, sizeof(x.sindex_), 1); + dmlc::ByteSwap(&x.info_, sizeof(x.info_), 1); + return x; + } + private: /*! * \brief in leaf node, we have weights, in non-leaf nodes, diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index cf22453245ac..f2cd880ba074 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1465,8 +1465,12 @@ def reshape_output(predt, rows): ctypes.c_uint(iteration_range[1])) # once caching is supported, we can pass id(data) as cache id. 
- if isinstance(data, DataFrame): - data = data.values + try: + import pandas as pd + if isinstance(data, pd.DataFrame): + data = data.values + except ImportError: + pass if isinstance(data, np.ndarray): assert data.flags.c_contiguous arr = np.array(data.reshape(data.size), copy=False, diff --git a/src/common/version.cc b/src/common/version.cc index 3fb2e5c24392..e9d4fe9d13d8 100644 --- a/src/common/version.cc +++ b/src/common/version.cc @@ -49,9 +49,9 @@ Version::TripletT Version::Load(dmlc::Stream* fi) { LOG(FATAL) << msg; } - CHECK_EQ(fi->Read(&major, sizeof(major)), sizeof(major)) << msg; - CHECK_EQ(fi->Read(&minor, sizeof(major)), sizeof(minor)) << msg; - CHECK_EQ(fi->Read(&patch, sizeof(major)), sizeof(patch)) << msg; + CHECK(fi->Read(&major)) << msg; + CHECK(fi->Read(&minor)) << msg; + CHECK(fi->Read(&patch)) << msg; return std::make_tuple(major, minor, patch); } @@ -69,9 +69,9 @@ void Version::Save(dmlc::Stream* fo) { std::tie(major, minor, patch) = Self(); std::string verstr { u8"version:" }; fo->Write(&verstr[0], verstr.size()); - fo->Write(&major, sizeof(major)); - fo->Write(&minor, sizeof(minor)); - fo->Write(&patch, sizeof(patch)); + fo->Write(major); + fo->Write(minor); + fo->Write(patch); } std::string Version::String(TripletT const& version) { diff --git a/src/data/data.cc b/src/data/data.cc index 677812ebba7a..8bd7c76cf59f 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -83,7 +83,7 @@ void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name, CHECK(strm->Read(&is_scalar)) << invalid; CHECK(is_scalar) << invalid << "Expected field " << expected_name << " to be a scalar; got a vector"; - CHECK(strm->Read(field, sizeof(T))) << invalid; + CHECK(strm->Read(field)) << invalid; } template @@ -653,14 +653,18 @@ DMatrix* DMatrix::Load(const std::string& uri, std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi != nullptr) { common::PeekableInStream is(fi.get()); - if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) && - magic == data::SimpleDMatrix::kMagic) { - DMatrix* dmat = new data::SimpleDMatrix(&is); - if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&magic, sizeof(magic), 1); + } + if (magic == data::SimpleDMatrix::kMagic) { + DMatrix* dmat = new data::SimpleDMatrix(&is); + if (!silent) { + LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " + << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + } + return dmat; } - return dmat; } } } diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index f054ff64a490..06fa385b48de 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -192,8 +192,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) { int tmagic; - CHECK(in_stream->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic)) - << "invalid input file format"; + CHECK(in_stream->Read(&tmagic)) << "invalid input file format"; CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch"; info_.LoadBinary(in_stream); in_stream->Read(&sparse_page_.offset.HostVector()); @@ -203,7 +202,7 @@ SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) { void SimpleDMatrix::SaveToLocalFile(const std::string& fname) { std::unique_ptr 
fo(dmlc::Stream::Create(fname.c_str(), "w")); int tmagic = kMagic; - fo->Write(&tmagic, sizeof(tmagic)); + fo->Write(tmagic); info_.SaveBinary(fo.get()); fo->Write(sparse_page_.offset.HostVector()); fo->Write(sparse_page_.data.HostVector()); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 108af403b1a3..6db6de9fad55 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -144,7 +144,7 @@ class ExternalMemoryPrefetcher : dmlc::DataIter { std::unique_ptr finfo( dmlc::Stream::Create(info.name_info.c_str(), "r")); int tmagic; - CHECK_EQ(finfo->Read(&tmagic, sizeof(tmagic)), sizeof(tmagic)); + CHECK(finfo->Read(&tmagic)); CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch"; } files_.resize(info.name_shards.size()); @@ -359,7 +359,7 @@ class SparsePageSource { std::unique_ptr fo( dmlc::Stream::Create(cache_info_.name_info.c_str(), "w")); int tmagic = kMagic; - fo->Write(&tmagic, sizeof(tmagic)); + fo->Write(tmagic); // Either every row has query ID or none at all CHECK(qids.empty() || qids.size() == info.num_row_); info.SaveBinary(fo.get()); diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc index 8ebd8284c269..4a20b48f7d1d 100644 --- a/src/gbm/gbtree_model.cc +++ b/src/gbm/gbtree_model.cc @@ -12,18 +12,35 @@ namespace gbm { void GBTreeModel::Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_trees, static_cast(trees.size())); - fo->Write(¶m, sizeof(param)); + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(¶m, sizeof(param)); + } else { + auto x = param.ByteSwap(); + fo->Write(&x, sizeof(x)); + } for (const auto & tree : trees) { tree->Save(fo); } if (tree_info.size() != 0) { - fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + } else { + for (const auto& e : tree_info) { + auto x = e; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(x)); + } + } } } void GBTreeModel::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)) << "GBTree: invalid model file"; + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } trees.clear(); trees_to_update.clear(); for (int32_t i = 0; i < param.num_trees; ++i) { @@ -33,9 +50,16 @@ void GBTreeModel::Load(dmlc::Stream* fi) { } tree_info.resize(param.num_trees); if (param.num_trees != 0) { - CHECK_EQ( - fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), - sizeof(int32_t) * param.num_trees); + if (DMLC_IO_NO_ENDIAN_SWAP) { + CHECK_EQ( + fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), + sizeof(int32_t) * param.num_trees); + } else { + for (auto& info : tree_info) { + CHECK_EQ(fi->Read(&info, sizeof(int32_t)), sizeof(int32_t)); + dmlc::ByteSwap(&info, sizeof(info), 1); + } + } } } diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index 7ac7d8f470a2..5a89878d3816 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -61,6 +61,21 @@ struct GBTreeModelParam : public dmlc::Parameter { .set_default(0) .describe("Reserved option for vector tree."); } + + // Swap byte order for all fields. 
Useful for transporting models between machines with different + // endianness (big endian vs little endian) + inline GBTreeModelParam ByteSwap() const { + GBTreeModelParam x = *this; + dmlc::ByteSwap(&x.num_trees, sizeof(x.num_trees), 1); + dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); + dmlc::ByteSwap(&x.deprecated_num_feature, sizeof(x.deprecated_num_feature), 1); + dmlc::ByteSwap(&x.pad_32bit, sizeof(x.pad_32bit), 1); + dmlc::ByteSwap(&x.deprecated_num_pbuffer, sizeof(x.deprecated_num_pbuffer), 1); + dmlc::ByteSwap(&x.deprecated_num_output_group, sizeof(x.deprecated_num_output_group), 1); + dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } }; struct GBTreeModel : public Model { diff --git a/src/learner.cc b/src/learner.cc index 47080a5c12b9..8210c4d1c89b 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -128,6 +128,19 @@ struct LearnerModelParamLegacy : public dmlc::Parameter std::string str = get(j_param.at("base_score")); from_chars(str.c_str(), str.c_str() + str.size(), base_score); } + inline LearnerModelParamLegacy ByteSwap() const { + LearnerModelParamLegacy x = *this; + dmlc::ByteSwap(&x.base_score, sizeof(x.base_score), 1); + dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1); + dmlc::ByteSwap(&x.num_class, sizeof(x.num_class), 1); + dmlc::ByteSwap(&x.contain_extra_attrs, sizeof(x.contain_extra_attrs), 1); + dmlc::ByteSwap(&x.contain_eval_metrics, sizeof(x.contain_eval_metrics), 1); + dmlc::ByteSwap(&x.major_version, sizeof(x.major_version), 1); + dmlc::ByteSwap(&x.minor_version, sizeof(x.minor_version), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } + // declare parameters DMLC_DECLARE_PARAMETER(LearnerModelParamLegacy) { DMLC_DECLARE_FIELD(base_score) @@ -694,7 +707,9 @@ class LearnerIO : public LearnerConfiguration { // read parameter CHECK_EQ(fi->Read(&mparam_, sizeof(mparam_)), sizeof(mparam_)) << "BoostLearner: wrong model format"; - + if (!DMLC_IO_NO_ENDIAN_SWAP) { + mparam_ = mparam_.ByteSwap(); + } CHECK(fi->Read(&tparam_.objective)) << "BoostLearner: wrong model format"; CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format"; @@ -828,7 +843,12 @@ class LearnerIO : public LearnerConfiguration { } std::string header {"binf"}; fo->Write(header.data(), 4); - fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + } else { + LearnerModelParamLegacy x = mparam.ByteSwap(); + fo->Write(&x, sizeof(LearnerModelParamLegacy)); + } fo->Write(tparam_.objective); fo->Write(tparam_.booster); gbm_->Save(fo); @@ -867,7 +887,13 @@ class LearnerIO : public LearnerConfiguration { // concatonate the model and config at final output, it's a temporary solution for // continuing support for binary model format fo->Write(&serialisation_header_[0], serialisation_header_.size()); - fo->Write(&json_offset, sizeof(json_offset)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&json_offset, sizeof(json_offset)); + } else { + auto x = json_offset; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(json_offset)); + } fo->Write(&binary_buf[0], binary_buf.size()); fo->Write(&config_str[0], config_str.size()); } @@ -904,6 +930,9 @@ class LearnerIO : public LearnerConfiguration { )doc"; int64_t sz {-1}; CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz)); + if 
(!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&sz, sizeof(sz), 1); + } CHECK_GT(sz, 0); size_t json_offset = static_cast(sz); std::string buffer; diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 8f45621ca15e..7f9721aef1d9 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -664,13 +664,26 @@ bst_node_t RegTree::GetNumSplitNodes() const { void RegTree::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(TreeParam)), sizeof(TreeParam)); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } nodes_.resize(param.num_nodes); stats_.resize(param.num_nodes); CHECK_NE(param.num_nodes, 0); CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()), sizeof(Node) * nodes_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (Node& node : nodes_) { + node = node.ByteSwap(); + } + } CHECK_EQ(fi->Read(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * stats_.size()), sizeof(RTreeNodeStat) * stats_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (RTreeNodeStat& stat : stats_) { + stat = stat.ByteSwap(); + } + } // chg deleted nodes deleted_nodes_.resize(0); for (int i = 1; i < param.num_nodes; ++i) { @@ -683,11 +696,32 @@ void RegTree::Load(dmlc::Stream* fi) { void RegTree::Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); CHECK_EQ(param.num_nodes, static_cast(stats_.size())); - fo->Write(¶m, sizeof(TreeParam)); CHECK_EQ(param.deprecated_num_roots, 1); CHECK_NE(param.num_nodes, 0); - fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); - fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(¶m, sizeof(TreeParam)); + } else { + TreeParam x = param.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); + } else { + for (const Node& node : nodes_) { + Node x = node.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + } else { + for (const RTreeNodeStat& stat : stats_) { + RTreeNodeStat x = stat.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } } void RegTree::LoadModel(Json const& in) { diff --git a/tests/ci_build/Dockerfile.s390x b/tests/ci_build/Dockerfile.s390x new file mode 100644 index 000000000000..5ad4a7888feb --- /dev/null +++ b/tests/ci_build/Dockerfile.s390x @@ -0,0 +1,27 @@ +FROM s390x/ubuntu:20.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \ + cmake time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base && \ + python3 -m pip install pytest hypothesis + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index ba3b12e337e2..8665420d684a 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -453,7 +453,8 @@ TEST(Json, Invalid) { 
Json load{Json::Load(StringView(str.c_str(), str.size()))}; } catch (dmlc::Error const &e) { std::string msg = e.what(); - ASSERT_NE(msg.find("EOF"), std::string::npos); + ASSERT_TRUE(msg.find("EOF") != std::string::npos + || msg.find("255") != std::string::npos); // EOF is printed as 255 on s390x has_thrown = true; }; ASSERT_TRUE(has_thrown); diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index dbf2b80a2de4..1dbc5fc2c89b 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -6,6 +6,7 @@ #include "xgboost/json_io.h" namespace xgboost { +#if DMLC_IO_NO_ENDIAN_SWAP // skip on big-endian machines // Manually construct tree in binary format // Do not use structs in case they change // We want to preserve backwards compatibility @@ -85,6 +86,7 @@ TEST(Tree, Load) { EXPECT_EQ(tree[1].LeafValue(), 0.1f); EXPECT_TRUE(tree[1].IsLeaf()); } +#endif // DMLC_IO_NO_ENDIAN_SWAP TEST(Tree, AllocateNode) { RegTree tree; diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py index 25c1c4de6c1f..33e64f7dd40b 100644 --- a/tests/python/test_demos.py +++ b/tests/python/test_demos.py @@ -109,6 +109,8 @@ def test_evals_result_demo(): subprocess.check_call(cmd) +@pytest.mark.skipif(**tm.no_sklearn()) +@pytest.mark.skipif(**tm.no_pandas()) def test_aft_demo(): script = os.path.join(DEMO_DIR, 'aft_survival', 'aft_survival_demo.py') cmd = ['python', script] diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py index 9338c095d657..4fca3e59302b 100644 --- a/tests/python/test_early_stopping.py +++ b/tests/python/test_early_stopping.py @@ -82,6 +82,7 @@ def test_cv_early_stopping(self): self.assert_metrics_length(cv, 1) @pytest.mark.skipif(**tm.no_sklearn()) + @pytest.mark.skipif(**tm.no_pandas()) def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self): from sklearn.datasets import load_breast_cancer diff --git a/tests/python/test_model_compatibility.py b/tests/python/test_model_compatibility.py index 55110720bf92..e02134d6cc71 100644 --- a/tests/python/test_model_compatibility.py +++ b/tests/python/test_model_compatibility.py @@ -1,10 +1,12 @@ import xgboost import os import generate_models as gm +import testing as tm import json import zipfile import pytest import copy +import urllib.request def run_model_param_check(config): @@ -87,6 +89,7 @@ def run_scikit_model_check(name, path): assert False +@pytest.mark.skipif(**tm.no_sklearn()) def test_model_compatibility(): '''Test model compatibility, can only be run on CI as others don't have the credentials. 
@@ -94,17 +97,9 @@ def test_model_compatibility(): ''' path = os.path.dirname(os.path.abspath(__file__)) path = os.path.join(path, 'models') - try: - import boto3 - import botocore - except ImportError: - pytest.skip( - 'Skiping compatibility tests as boto3 is not installed.') - - s3_bucket = boto3.resource('s3').Bucket('xgboost-ci-jenkins-artifacts') - zip_path = 'xgboost_model_compatibility_test.zip' - s3_bucket.download_file(zip_path, zip_path) + zip_path, _ = urllib.request.urlretrieve('https://xgboost-ci-jenkins-artifacts.s3-us-west-2' + + '.amazonaws.com/xgboost_model_compatibility_test.zip') with zipfile.ZipFile(zip_path, 'r') as z: z.extractall(path) diff --git a/tests/python/testing.py b/tests/python/testing.py index c3f78f78e966..30b44079607b 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -2,13 +2,17 @@ import os from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED from xgboost.compat import DASK_INSTALLED +import pytest +import tempfile +import xgboost as xgb +import numpy as np + +hypothesis = pytest.importorskip('hypothesis') +sklearn = pytest.importorskip('sklearn') from hypothesis import strategies from hypothesis.extra.numpy import arrays from joblib import Memory from sklearn import datasets -import tempfile -import xgboost as xgb -import numpy as np try: import cupy as cp diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh index a0e1c9f28651..500aa1e57ae1 100755 --- a/tests/travis/run_test.sh +++ b/tests/travis/run_test.sh @@ -88,3 +88,19 @@ if [ ${TASK} == "cmake_test" ]; then cd .. rm -rf build fi + +if [ ${TASK} == "s390x_test" ]; then + set -e + + # Build and run C++ tests + rm -rf build + mkdir build && cd build + cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja + time ninja -v + ./testxgboost + + # Run model compatibility tests + cd .. 
+ python3 -m pip install --user pytest hypothesis + PYTHONPATH=./python-package python3 -m pytest --fulltrace -v -rxXs tests/python/ -k 'test_model' +fi diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh index 5a7a91671da0..0e9f7e8fd687 100755 --- a/tests/travis/setup.sh +++ b/tests/travis/setup.sh @@ -20,6 +20,15 @@ if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3" fi +if [ ${TASK} == "s390x_test" ] && [ ${TRAVIS_CPU_ARCH} == "s390x" ]; then + sudo snap install cmake --channel=3.17/beta --classic + export PATH=/snap/bin:${PATH} + cmake --version + sudo apt-get update + sudo apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \ + time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base +fi + if [ ${TASK} == "python_sdist_test" ] && [ ${TRAVIS_OS_NAME} == "linux" ]; then wget https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.sh sudo bash cmake-3.17.1-Linux-x86_64.sh --prefix=/usr/local --skip-license From e51cba6195dc97617b34360145263590dd07d7b8 Mon Sep 17 00:00:00 2001 From: Cuong Duong Date: Wed, 19 Aug 2020 11:04:09 +1000 Subject: [PATCH 03/15] Add SHAP summary plot using ggplot2 (#5882) * add SHAP summary plot using ggplot2 * Update xgb.plot.shap * Update example in xgb.plot.shap documentation * update logic, add tests * whitespace fixes * whitespace fixes for test_helpers * namespace for sd function * explicitly declare variables that are automatically evaluated by data.table * Fix R lint Co-authored-by: Philip Hyunsu Cho --- R-package/R/xgb.ggplot.R | 81 ++++++++++- R-package/R/xgb.plot.shap.R | 172 +++++++++++++++++------- R-package/tests/testthat/test_helpers.R | 40 +++++- 3 files changed, 238 insertions(+), 55 deletions(-) diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R index 3b76e9facf42..339e0fac1600 100644 --- a/R-package/R/xgb.ggplot.R +++ b/R-package/R/xgb.ggplot.R @@ -99,6 +99,85 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med } } +#' @rdname xgb.plot.shap.summary +#' @export +xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL, + trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) { + data_list <- xgb.shap.data( + data = data, + shap_contrib = shap_contrib, + features = features, + top_n = top_n, + model = model, + trees = trees, + target_class = target_class, + approxcontrib = approxcontrib, + subsample = subsample, + max_observations = 10000 # 10,000 samples per feature. + ) + p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE) + # Reverse factor levels so that the first level is at the top of the plot + p_data[, "feature" := factor(feature, rev(levels(feature)))] + + p <- ggplot2::ggplot(p_data, ggplot2::aes(x = feature, y = shap_value, colour = feature_value)) + + ggplot2::geom_jitter(alpha = 0.5, width = 0.1) + + ggplot2::scale_colour_viridis_c(limits = c(-3, 3), option = "plasma", direction = -1) + + ggplot2::geom_abline(slope = 0, intercept = 0, colour = "darkgrey") + + ggplot2::coord_flip() + + p +} + +#' Combine and melt feature values and SHAP contributions for sample +#' observations. +#' +#' Conforms to data format required for ggplot functions. +#' +#' Internal utility function. +#' +#' @param data_list List containing 'data' and 'shap_contrib' returned by +#' \code{xgb.shap.data()}. 
+#' @param normalize Whether to standardize feature values to have mean 0 and +#' standard deviation 1 (useful for comparing multiple features on the same +#' plot). Default \code{FALSE}. +#' +#' @return A data.table containing the observation ID, the feature name, the +#' feature value (normalized if specified), and the SHAP contribution value. +prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) { + data <- data_list[["data"]] + shap_contrib <- data_list[["shap_contrib"]] + + data <- data.table::as.data.table(as.matrix(data)) + if (normalize) { + data[, (names(data)) := lapply(.SD, normalize)] + } + data[, "id" := seq_len(nrow(data))] + data_m <- data.table::melt.data.table(data, id.vars = "id", variable.name = "feature", value.name = "feature_value") + + shap_contrib <- data.table::as.data.table(as.matrix(shap_contrib)) + shap_contrib[, "id" := seq_len(nrow(shap_contrib))] + shap_contrib_m <- data.table::melt.data.table(shap_contrib, id.vars = "id", variable.name = "feature", value.name = "shap_value") + + p_data <- data.table::merge.data.table(data_m, shap_contrib_m, by = c("id", "feature")) + + p_data +} + +#' Scale feature value to have mean 0, standard deviation 1 +#' +#' This is used to compare multiple features on the same plot. +#' Internal utility function +#' +#' @param x Numeric vector +#' +#' @return Numeric vector with mean 0 and sd 1. +normalize <- function(x) { + loc <- mean(x, na.rm = TRUE) + scale <- stats::sd(x, na.rm = TRUE) + + (x - loc) / scale +} + # Plot multiple ggplot graph aligned by rows and columns. # ... the plots # cols number of columns @@ -131,5 +210,5 @@ multiplot <- function(..., cols = 1) { globalVariables(c( "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", - "element_blank", "element_text", "V1", "Weight" + "element_blank", "element_text", "V1", "Weight", "feature" )) diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index a44d4b570a09..d9ea69786ad9 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -81,6 +81,7 @@ #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) #' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) +#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot #' #' # multiclass example - plots for each class separately: #' nclass <- 3 @@ -99,6 +100,7 @@ #' n_col = 2, col = col, pch = 16, pch_NA = 17) #' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, #' n_col = 2, col = col, pch = 16, pch_NA = 17) +#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot #' #' @rdname xgb.plot.shap #' @export @@ -109,69 +111,33 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07, plot_loess = TRUE, col_loess = 2, span_loess = 0.5, which = c("1d", "2d"), plot = TRUE, ...) 
{ - - if (!is.matrix(data) && !inherits(data, "dgCMatrix")) - stop("data: must be either matrix or dgCMatrix") - - if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster"))) - stop("when shap_contrib is not provided, one must provide an xgb.Booster model") - - if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster"))) - stop("when features are not provided, one must provide an xgb.Booster model to rank the features") - - if (!is.null(shap_contrib) && - (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1)) - stop("shap_contrib is not compatible with the provided data") - - nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data)) - idx <- sample(seq_len(nrow(data)), nsample) - data <- data[idx, ] - - if (is.null(shap_contrib)) { - shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib) - } else { - shap_contrib <- shap_contrib[idx, ] - } + data_list <- xgb.shap.data( + data = data, + shap_contrib = shap_contrib, + features = features, + top_n = top_n, + model = model, + trees = trees, + target_class = target_class, + approxcontrib = approxcontrib, + subsample = subsample, + max_observations = 100000 + ) + data <- data_list[["data"]] + shap_contrib <- data_list[["shap_contrib"]] + features <- colnames(data) which <- match.arg(which) if (which == "2d") stop("2D plots are not implemented yet") - if (is.null(features)) { - imp <- xgb.importance(model = model, trees = trees) - top_n <- as.integer(top_n[1]) - if (top_n < 1 && top_n > 100) - stop("top_n: must be an integer within [1, 100]") - features <- imp$Feature[1:min(top_n, NROW(imp))] - } - - if (is.character(features)) { - if (is.null(colnames(data))) - stop("Either provide `data` with column names or provide `features` as column indices") - features <- match(features, colnames(data)) - } - if (n_col > length(features)) n_col <- length(features) - - if (is.list(shap_contrib)) { # multiclass: either choose a class or merge - shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] - else Reduce("+", lapply(shap_contrib, abs)) - } - - shap_contrib <- shap_contrib[, features, drop = FALSE] - data <- data[, features, drop = FALSE] - cols <- colnames(data) - if (is.null(cols)) cols <- colnames(shap_contrib) - if (is.null(cols)) cols <- paste0('X', seq_len(ncol(data))) - colnames(data) <- cols - colnames(shap_contrib) <- cols - if (plot && which == "1d") { op <- par(mfrow = c(ceiling(length(features) / n_col), n_col), oma = c(0, 0, 0, 0) + 0.2, mar = c(3.5, 3.5, 0, 0) + 0.1, mgp = c(1.7, 0.6, 0)) - for (f in cols) { + for (f in features) { ord <- order(data[, f]) x <- data[, f][ord] y <- shap_contrib[, f][ord] @@ -216,3 +182,105 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, } invisible(list(data = data, shap_contrib = shap_contrib)) } + +#' SHAP contribution dependency summary plot +#' +#' Compare SHAP contributions of different features. +#' +#' A point plot (each point representing one sample from \code{data}) is +#' produced for each feature, with the points plotted on the SHAP value axis. +#' Each point (observation) is coloured based on its feature value. The plot +#' hence allows us to see which features have a negative / positive contribution +#' on the model prediction, and whether the contribution is different for larger +#' or smaller values of the feature. 
We effectively try to replicate the +#' \code{summary_plot} function from https://github.com/slundberg/shap. +#' +#' @inheritParams xgb.plot.shap +#' +#' @return A \code{ggplot2} object. +#' @export +#' +#' @examples See \code{\link{xgb.plot.shap}}. +#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}}, +#' \code{\url{https://github.com/slundberg/shap}} +xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL, + trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) { + # Only ggplot implementation is available. + xgb.ggplot.shap.summary(data, shap_contrib, features, top_n, model, trees, target_class, approxcontrib, subsample) +} + +#' Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc. +#' Internal utility function. +#' +#' @return A list containing: 'data', a matrix containing sample observations +#' and their feature values; 'shap_contrib', a matrix containing the SHAP contribution +#' values for these observations. +xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL, + trees = NULL, target_class = NULL, approxcontrib = FALSE, + subsample = NULL, max_observations = 100000) { + if (!is.matrix(data) && !inherits(data, "dgCMatrix")) + stop("data: must be either matrix or dgCMatrix") + + if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster"))) + stop("when shap_contrib is not provided, one must provide an xgb.Booster model") + + if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster"))) + stop("when features are not provided, one must provide an xgb.Booster model to rank the features") + + if (!is.null(shap_contrib) && + (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1)) + stop("shap_contrib is not compatible with the provided data") + + if (is.character(features) && is.null(colnames(data))) + stop("either provide `data` with column names or provide `features` as column indices") + + if (is.null(model$feature_names) && model$nfeatures != ncol(data)) + stop("if model has no feature_names, columns in `data` must match features in model") + + if (!is.null(subsample)) { + idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE) + } else { + idx <- seq_len(min(nrow(data), max_observations)) + } + data <- data[idx, ] + if (is.null(colnames(data))) { + colnames(data) <- paste0("X", seq_len(ncol(data))) + } + + if (!is.null(shap_contrib)) { + if (is.list(shap_contrib)) { # multiclass: either choose a class or merge + shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs)) + } + shap_contrib <- shap_contrib[idx, ] + if (is.null(colnames(shap_contrib))) { + colnames(shap_contrib) <- paste0("X", seq_len(ncol(data))) + } + } else { + shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib) + if (is.list(shap_contrib)) { # multiclass: either choose a class or merge + shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs)) + } + } + + if (is.null(features)) { + if (!is.null(model$feature_names)) { + imp <- xgb.importance(model = model, trees = trees) + } else { + imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data)) + } + top_n <- top_n[1] + if (top_n < 1 | top_n > 100) stop("top_n: must be an integer 
within [1, 100]") + features <- imp$Feature[1:min(top_n, NROW(imp))] + } + if (is.character(features)) { + features <- match(features, colnames(data)) + } + + shap_contrib <- shap_contrib[, features, drop = FALSE] + data <- data[, features, drop = FALSE] + + list( + data = data, + shap_contrib = shap_contrib + ) +} diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 2ee1acf568a7..86c0efd0207e 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -351,11 +351,47 @@ test_that("xgb.plot.deepness works", { xgb.ggplot.deepness(model = bst.Tree) }) +test_that("xgb.shap.data works when top_n is provided", { + data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2) + expect_equal(names(data_list), c("data", "shap_contrib")) + expect_equal(NCOL(data_list$data), 2) + expect_equal(NCOL(data_list$shap_contrib), 2) + expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib)) + expect_gt(length(colnames(data_list$data)), 0) + expect_gt(length(colnames(data_list$shap_contrib)), 0) + + # for multiclass without target class provided + data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2) + expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2)) + # for multiclass with target class provided + data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0) + expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2)) +}) + +test_that("xgb.shap.data works with subsampling", { + data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8) + expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix))) + expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib)) +}) + +test_that("prepare.ggplot.shap.data works", { + data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2) + plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE) + expect_s3_class(plot_data, "data.frame") + expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value")) + expect_s3_class(plot_data$feature, "factor") + # Each observation should have 1 row for each feature + expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2) +}) + test_that("xgb.plot.shap works", { sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4) expect_equal(names(sh), c("data", "shap_contrib")) - expect_equal(NCOL(sh$data), 2) - expect_equal(NCOL(sh$shap_contrib), 2) +}) + +test_that("xgb.plot.shap.summary works", { + xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2) + xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2) }) test_that("check.deprecation works", { From f58e41bad8905d8531038ce70f7c752c4b1c1557 Mon Sep 17 00:00:00 2001 From: Anthony D'Amato Date: Wed, 19 Aug 2020 03:55:37 +0200 Subject: [PATCH 04/15] Fix deterministic partitioning with dataset containing Double.NaN (#5996) The functions featureValueOfSparseVector or featureValueOfDenseVector could return a Float.NaN if the input vectore was containing any missing values. This would make fail the partition key computation and most of the vectors would end up in the same partition. We fix this by avoid returning a NaN and simply use the row HashCode in this case. We added a test to ensure that the repartition is indeed now uniform on input dataset containing values by checking that the partitions size variance is below a certain threshold. 
Signed-off-by: Anthony D'Amato --- .../xgboost4j/scala/spark/DataUtils.scala | 3 +- .../DeterministicPartitioningSuite.scala | 31 +++++++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala index df787d8eb8ab..15ffe4c06c42 100644 --- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala +++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala @@ -103,7 +103,8 @@ object DataUtils extends Serializable { case sparseVector: SparseVector => featureValueOfSparseVector(rowHashCode, sparseVector) } - math.abs((rowHashCode.toLong + featureValue).toString.hashCode % numPartitions) + val nonNaNFeatureValue = if (featureValue.isNaN) { 0.0f } else { featureValue } + math.abs((rowHashCode.toLong + nonNaNFeatureValue).toString.hashCode % numPartitions) } private def attachPartitionKey( diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala index 986b0843b5f3..ff0492f41a4a 100644 --- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala +++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala @@ -16,6 +16,7 @@ package ml.dmlc.xgboost4j.scala.spark +import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite import org.apache.spark.sql.functions._ @@ -79,4 +80,34 @@ class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite wit map2 } } + + test("deterministic partitioning has a uniform repartition on dataset with missing values") { + val N = 10000 + val dataset = (0 until N).map{ n => + (n, n % 2, Vectors.sparse(3, Array(0, 1, 2), Array(Double.NaN, n, Double.NaN))) + } + + val df = ss.createDataFrame(sc.parallelize(dataset)).toDF("id", "label", "features") + + val dfRepartitioned = DataUtils.convertDataFrameToXGBLabeledPointRDDs( + col("label"), + col("features"), + lit(1.0), + lit(Float.NaN), + None, + 10, + deterministicPartition = true, + df + ).head + + val partitionsSizes = dfRepartitioned + .mapPartitions(iter => Array(iter.size.toDouble).iterator, true) + .collect() + val partitionMean = partitionsSizes.sum / partitionsSizes.length + val squaredDiffSum = partitionsSizes + .map(partitionSize => Math.pow(partitionSize - partitionMean, 2)) + val standardDeviation = math.sqrt(squaredDiffSum.sum / squaredDiffSum.length) + + assert(standardDeviation < math.sqrt(N.toDouble)) + } } From 90355b4f007ae5e0d0c7f9ccaf9f6752d80dbfa8 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 19 Aug 2020 09:57:43 +0800 Subject: [PATCH 05/15] Make JSON the default full serialization format. 
(#6027) --- include/xgboost/generic_parameters.h | 4 +-- tests/cpp/test_serialization.cc | 38 ---------------------------- tests/python/test_cli.py | 9 ++++--- 3 files changed, 8 insertions(+), 43 deletions(-) diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h index 752342b9a90c..a78453604467 100644 --- a/include/xgboost/generic_parameters.h +++ b/include/xgboost/generic_parameters.h @@ -27,7 +27,7 @@ struct GenericParameter : public XGBoostParameter { int gpu_id; // gpu page size in external memory mode, 0 means using the default. size_t gpu_page_size; - bool enable_experimental_json_serialization {false}; + bool enable_experimental_json_serialization {true}; bool validate_parameters {false}; void CheckDeprecated() { @@ -68,7 +68,7 @@ struct GenericParameter : public XGBoostParameter { .set_lower_bound(0) .describe("GPU page size when running in external memory mode."); DMLC_DECLARE_FIELD(enable_experimental_json_serialization) - .set_default(false) + .set_default(true) .describe("Enable using JSON for memory serialization (Python Pickle, " "rabit checkpoints etc.)."); DMLC_DECLARE_FIELD(validate_parameters) diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index 23334408f1fa..66428e8de950 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -179,7 +179,6 @@ TEST_F(SerializationTest, Exact) { {"nthread", "1"}, {"base_score", "3.14195265"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); @@ -189,7 +188,6 @@ TEST_F(SerializationTest, Exact) { {"base_score", "3.14195265"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); @@ -198,7 +196,6 @@ TEST_F(SerializationTest, Exact) { {"nthread", "1"}, {"base_score", "3.14195265"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); } @@ -208,7 +205,6 @@ TEST_F(SerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); @@ -217,7 +213,6 @@ TEST_F(SerializationTest, Approx) { {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); @@ -225,7 +220,6 @@ TEST_F(SerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); } @@ -235,7 +229,6 @@ TEST_F(SerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); @@ -244,7 +237,6 @@ TEST_F(SerializationTest, Hist) { {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); @@ -252,7 +244,6 @@ TEST_F(SerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); } @@ -261,7 +252,6 @@ TEST_F(SerializationTest, CPUCoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, p_dmat_); } @@ -270,7 +260,6 @@ 
TEST_F(SerializationTest, CPUCoordDescent) { TEST_F(SerializationTest, GpuHist) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, - {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"tree_method", "gpu_hist"}}, @@ -278,7 +267,6 @@ TEST_F(SerializationTest, GpuHist) { TestLearnerSerialization({{"booster", "gbtree"}, {"seed", "0"}, - {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, @@ -287,7 +275,6 @@ TEST_F(SerializationTest, GpuHist) { TestLearnerSerialization({{"booster", "dart"}, {"seed", "0"}, - {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"tree_method", "gpu_hist"}}, @@ -345,7 +332,6 @@ TEST_F(SerializationTest, GPUCoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, p_dmat_); } @@ -380,7 +366,6 @@ TEST_F(LogitSerializationTest, Exact) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); @@ -389,7 +374,6 @@ TEST_F(LogitSerializationTest, Exact) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); } @@ -400,7 +384,6 @@ TEST_F(LogitSerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); @@ -409,7 +392,6 @@ TEST_F(LogitSerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); } @@ -420,7 +402,6 @@ TEST_F(LogitSerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); @@ -429,7 +410,6 @@ TEST_F(LogitSerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); } @@ -438,7 +418,6 @@ TEST_F(LogitSerializationTest, CPUCoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, p_dmat_); } @@ -450,14 +429,12 @@ TEST_F(LogitSerializationTest, GpuHist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, p_dmat_); TestLearnerSerialization({{"booster", "gbtree"}, {"objective", "binary:logistic"}, {"seed", "0"}, - {"enable_experimental_json_serialization", "1"}, {"nthread", "1"}, {"max_depth", "2"}, {"num_parallel_tree", "4"}, @@ -469,7 +446,6 @@ TEST_F(LogitSerializationTest, GpuHist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", "2"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, p_dmat_); } @@ -479,7 +455,6 @@ TEST_F(LogitSerializationTest, GPUCoordDescent) { {"objective", "binary:logistic"}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, p_dmat_); } @@ -515,7 +490,6 @@ TEST_F(MultiClassesSerializationTest, Exact) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - 
{"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); @@ -525,7 +499,6 @@ TEST_F(MultiClassesSerializationTest, Exact) { {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, {"num_parallel_tree", "4"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); @@ -534,7 +507,6 @@ TEST_F(MultiClassesSerializationTest, Exact) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "exact"}}, fmap_, p_dmat_); } @@ -545,7 +517,6 @@ TEST_F(MultiClassesSerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); @@ -554,7 +525,6 @@ TEST_F(MultiClassesSerializationTest, Approx) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "approx"}}, fmap_, p_dmat_); } @@ -565,7 +535,6 @@ TEST_F(MultiClassesSerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); @@ -574,7 +543,6 @@ TEST_F(MultiClassesSerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"num_parallel_tree", "4"}, {"tree_method", "hist"}}, fmap_, p_dmat_); @@ -584,7 +552,6 @@ TEST_F(MultiClassesSerializationTest, Hist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "hist"}}, fmap_, p_dmat_); } @@ -593,7 +560,6 @@ TEST_F(MultiClassesSerializationTest, CPUCoordDescent) { TestLearnerSerialization({{"booster", "gblinear"}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "coord_descent"}}, fmap_, p_dmat_); } @@ -609,7 +575,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) { // different result (1e-7) with CPU predictor for some // entries. {"predictor", "gpu_predictor"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, p_dmat_); @@ -621,7 +586,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) { // GPU_Hist has higher floating point error. 
1e-6 doesn't work // after num_parallel_tree goes to 4 {"num_parallel_tree", "3"}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, p_dmat_); @@ -630,7 +594,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) { {"seed", "0"}, {"nthread", "1"}, {"max_depth", std::to_string(kClasses)}, - {"enable_experimental_json_serialization", "1"}, {"tree_method", "gpu_hist"}}, fmap_, p_dmat_); } @@ -640,7 +603,6 @@ TEST_F(MultiClassesSerializationTest, GPUCoordDescent) { {"num_class", std::to_string(kClasses)}, {"seed", "0"}, {"nthread", "1"}, - {"enable_experimental_json_serialization", "1"}, {"updater", "gpu_coord_descent"}}, fmap_, p_dmat_); } diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index 3ff37ea521c8..4efb87e21938 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -47,9 +47,12 @@ def test_cli_model(self): seed = 1994 with tempfile.TemporaryDirectory() as tmpdir: - model_out_cli = os.path.join(tmpdir, 'test_load_cli_model-cli.bin') - model_out_py = os.path.join(tmpdir, 'test_cli_model-py.bin') - config_path = os.path.join(tmpdir, 'test_load_cli_model.conf') + model_out_cli = os.path.join( + tmpdir, 'test_load_cli_model-cli.json') + model_out_py = os.path.join( + tmpdir, 'test_cli_model-py.json') + config_path = os.path.join( + tmpdir, 'test_load_cli_model.conf') train_conf = self.template.format(data_path=data_path, seed=seed, From 29b7fea572888cc335679380a72d16ea3d88c104 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Wed, 19 Aug 2020 10:03:45 +0800 Subject: [PATCH 06/15] Optimize cpu sketch allreduce for sparse data. (#6009) * Bypass RABIT serialization reducer and use custom allgather based merging. --- src/common/hist_util.h | 24 +-- src/common/quantile.cc | 225 +++++++++++++++++++++++------ src/common/quantile.h | 26 ++++ tests/cpp/c_api/test_c_api.cc | 10 +- tests/cpp/common/test_hist_util.cc | 6 +- tests/cpp/common/test_hist_util.h | 4 +- tests/cpp/common/test_json.cc | 6 +- tests/cpp/common/test_quantile.cc | 112 +++++++++++++- tests/cpp/common/test_quantile.h | 4 +- tests/python/test_with_dask.py | 25 +++- 10 files changed, 356 insertions(+), 86 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index d86b73135f34..0334b901224a 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -116,26 +116,14 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins) { for (auto& column : column_sizes) { column.resize(info.num_col_, 0); } - for (auto const& page : m->GetBatches()) { - page.data.HostVector(); - page.offset.HostVector(); - ParallelFor(page.Size(), threads, [&](size_t i) { - auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); - auto row = page[i]; - auto const *p_row = row.data(); - for (size_t j = 0; j < row.size(); ++j) { - local_column_sizes.at(p_row[j].index)++; - } - }); - } std::vector reduced(info.num_col_, 0); - - ParallelFor(info.num_col_, threads, [&](size_t i) { - for (auto const &thread : column_sizes) { - reduced[i] += thread[i]; + for (auto const& page : m->GetBatches()) { + auto const &entries_per_column = + HostSketchContainer::CalcColumnSize(page, info.num_col_, threads); + for (size_t i = 0; i < entries_per_column.size(); ++i) { + reduced[i] += entries_per_column[i]; } - }); - + } HostSketchContainer container(reduced, max_bins, HostSketchContainer::UseGroup(info)); for (auto const &page : m->GetBatches()) { diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 374864c8f4b0..9ab48a304b77 100644 --- 
a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -25,34 +25,67 @@ HostSketchContainer::HostSketchContainer(std::vector columns_size, } } -std::vector LoadBalance(SparsePage const &page, - std::vector columns_size, - size_t const nthreads) { - /* Some sparse datasets have their mass concentrating on small - * number of features. To avoid wating for a few threads running - * forever, we here distirbute different number of columns to - * different threads according to number of entries. */ - size_t const total_entries = page.data.Size(); +std::vector +HostSketchContainer::CalcColumnSize(SparsePage const &batch, + bst_feature_t const n_columns, + size_t const nthreads) { + auto page = batch.GetView(); + std::vector> column_sizes(nthreads); + for (auto &column : column_sizes) { + column.resize(n_columns, 0); + } + + ParallelFor(page.Size(), nthreads, [&](size_t i) { + auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); + auto row = page[i]; + auto const *p_row = row.data(); + for (size_t j = 0; j < row.size(); ++j) { + local_column_sizes.at(p_row[j].index)++; + } + }); + std::vector entries_per_columns(n_columns, 0); + ParallelFor(n_columns, nthreads, [&](size_t i) { + for (auto const &thread : column_sizes) { + entries_per_columns[i] += thread[i]; + } + }); + return entries_per_columns; +} + +std::vector HostSketchContainer::LoadBalance( + SparsePage const &batch, bst_feature_t n_columns, size_t const nthreads) { + /* Some sparse datasets have their mass concentrating on small number of features. To + * avoid wating for a few threads running forever, we here distirbute different number + * of columns to different threads according to number of entries. + */ + auto page = batch.GetView(); + size_t const total_entries = page.data.size(); size_t const entries_per_thread = common::DivRoundUp(total_entries, nthreads); - std::vector cols_ptr(nthreads+1, 0); + std::vector> column_sizes(nthreads); + for (auto& column : column_sizes) { + column.resize(n_columns, 0); + } + std::vector entries_per_columns = + CalcColumnSize(batch, n_columns, nthreads); + std::vector cols_ptr(nthreads + 1, 0); size_t count {0}; size_t current_thread {1}; - for (auto col : columns_size) { - cols_ptr[current_thread]++; // add one column to thread + for (auto col : entries_per_columns) { + cols_ptr.at(current_thread)++; // add one column to thread count += col; - if (count > entries_per_thread + 1) { + CHECK_LE(count, total_entries); + if (count > entries_per_thread) { current_thread++; count = 0; - cols_ptr[current_thread] = cols_ptr[current_thread-1]; + cols_ptr.at(current_thread) = cols_ptr[current_thread-1]; } } // Idle threads. for (; current_thread < cols_ptr.size() - 1; ++current_thread) { cols_ptr[current_thread+1] = cols_ptr[current_thread]; } - return cols_ptr; } @@ -67,11 +100,10 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, // Use group index for weights? auto batch = page.GetView(); dmlc::OMPException exec; - // Parallel over columns. Asumming the data is dense, each thread owns a set of - // consecutive columns. + // Parallel over columns. Each thread owns a set of consecutive columns. 
auto const ncol = static_cast(info.num_col_); auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_; - auto thread_columns_ptr = LoadBalance(page, columns_size_, nthread); + auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread); #pragma omp parallel num_threads(nthread) { @@ -112,58 +144,158 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, monitor_.Stop(__func__); } -void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, - int max_bin, HistogramCuts *cuts) { - size_t required_cuts = std::min(summary.size, static_cast(max_bin)); - auto& cut_values = cuts->cut_values_.HostVector(); - for (size_t i = 1; i < required_cuts; ++i) { - bst_float cpt = summary.data[i].value; - if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { - cut_values.push_back(cpt); - } +void HostSketchContainer::GatherSketchInfo( + std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches) { + auto& worker_segments = *p_worker_segments; + worker_segments.resize(1, 0); + auto world = rabit::GetWorldSize(); + auto rank = rabit::GetRank(); + auto n_columns = sketches_.size(); + + std::vector sketch_size; + for (auto const& sketch : reduced) { + sketch_size.push_back(sketch.size); + } + std::vector& sketches_scan = *p_sketches_scan; + sketches_scan.resize((n_columns + 1) * world, 0); + size_t beg_scan = rank * (n_columns + 1); + std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), + sketches_scan.begin() + beg_scan + 1); + // Gather all column pointers + rabit::Allreduce(sketches_scan.data(), sketches_scan.size()); + + for (int32_t i = 0; i < world; ++i) { + size_t back = (i + 1) * (n_columns + 1) - 1; + auto n_entries = sketches_scan.at(back); + worker_segments.push_back(n_entries); + } + // Offset of sketch from each worker. + std::partial_sum(worker_segments.begin(), worker_segments.end(), + worker_segments.begin()); + CHECK_GE(worker_segments.size(), 1); + auto total = worker_segments.back(); + + auto& global_sketches = *p_global_sketches; + global_sketches.resize(total, WQSketch::Entry{0, 0, 0, 0}); + auto worker_sketch = Span{global_sketches}.subspan( + worker_segments[rank], worker_segments[rank + 1] - worker_segments[rank]); + size_t cursor = 0; + for (auto const &sketch : reduced) { + std::copy(sketch.data, sketch.data + sketch.size, + worker_sketch.begin() + cursor); + cursor += sketch.size; } + + static_assert(sizeof(WQSketch::Entry) / 4 == sizeof(float), ""); + rabit::Allreduce( + reinterpret_cast(global_sketches.data()), + global_sketches.size() * sizeof(WQSketch::Entry) / sizeof(float)); } -void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { +void HostSketchContainer::AllReduce( + std::vector *p_reduced, + std::vector* p_num_cuts) { monitor_.Start(__func__); - rabit::Allreduce(columns_size_.data(), columns_size_.size()); - std::vector reduced(sketches_.size()); - std::vector num_cuts; - size_t nbytes = 0; + auto& num_cuts = *p_num_cuts; + CHECK_EQ(num_cuts.size(), 0); + auto &reduced = *p_reduced; + reduced.resize(sketches_.size()); + + size_t n_columns = sketches_.size(); + rabit::Allreduce(&n_columns, 1); + CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; + + // Prune the intermediate num cuts for synchronization. 
+ std::vector global_column_size(columns_size_); + rabit::Allreduce(global_column_size.data(), global_column_size.size()); + +size_t nbytes = 0; for (size_t i = 0; i < sketches_.size(); ++i) { int32_t intermediate_num_cuts = static_cast(std::min( - columns_size_[i], static_cast(max_bins_ * WQSketch::kFactor))); - if (columns_size_[i] != 0) { + global_column_size[i], static_cast(max_bins_ * WQSketch::kFactor))); + if (global_column_size[i] != 0) { WQSketch::SummaryContainer out; sketches_[i].GetSummary(&out); reduced[i].Reserve(intermediate_num_cuts); CHECK(reduced[i].data); reduced[i].SetPrune(out, intermediate_num_cuts); + nbytes = std::max( + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), + nbytes); } + num_cuts.push_back(intermediate_num_cuts); - nbytes = std::max( - WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), nbytes); } + auto world = rabit::GetWorldSize(); + if (world == 1) { + return; + } + + std::vector worker_segments(1, 0); // CSC pointer to sketches. + std::vector sketches_scan((n_columns + 1) * world, 0); + + std::vector global_sketches; + this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, + &global_sketches); + + std::vector final_sketches(n_columns); + ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) { + int32_t intermediate_num_cuts = num_cuts[fidx]; + auto nbytes = + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts); + + for (int32_t i = 1; i < world + 1; ++i) { + auto size = worker_segments.at(i) - worker_segments[i - 1]; + auto worker_sketches = Span{global_sketches}.subspan( + worker_segments[i - 1], size); + auto worker_scan = + Span(sketches_scan) + .subspan((i - 1) * (n_columns + 1), (n_columns + 1)); + + auto worker_feature = worker_sketches.subspan( + worker_scan[fidx], worker_scan[fidx + 1] - worker_scan[fidx]); + CHECK(worker_feature.data()); + WQSummary summary(worker_feature.data(), + worker_feature.size()); + auto &out = final_sketches.at(fidx); + out.Reduce(summary, nbytes); + } + + reduced.at(fidx).Reserve(intermediate_num_cuts); + reduced.at(fidx).SetPrune(final_sketches.at(fidx), intermediate_num_cuts); + }); + monitor_.Stop(__func__); +} - if (rabit::IsDistributed()) { - // FIXME(trivialfis): This call will allocate nbytes * num_columns on rabit, which - // may generate oom error when data is sparse. To fix it, we need to: - // - gather the column offsets over all workers. - // - run rabit::allgather on sketch data to collect all data. - // - merge all gathered sketches based on worker offsets and column offsets of data - // from each worker. - // See GPU implementation for details. 
- rabit::SerializeReducer sreducer; - sreducer.Allreduce(dmlc::BeginPtr(reduced), nbytes, reduced.size()); +void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, + int max_bin, HistogramCuts *cuts) { + size_t required_cuts = std::min(summary.size, static_cast(max_bin)); + auto& cut_values = cuts->cut_values_.HostVector(); + for (size_t i = 1; i < required_cuts; ++i) { + bst_float cpt = summary.data[i].value; + if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { + cut_values.push_back(cpt); + } } +} + +void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { + monitor_.Start(__func__); + std::vector reduced; + std::vector num_cuts; + this->AllReduce(&reduced, &num_cuts); cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); + for (size_t fid = 0; fid < reduced.size(); ++fid) { WQSketch::SummaryContainer a; size_t max_num_bins = std::min(num_cuts[fid], max_bins_); a.Reserve(max_num_bins + 1); CHECK(a.data); - if (columns_size_[fid] != 0) { + if (num_cuts[fid] != 0) { a.SetPrune(reduced[fid], max_num_bins + 1); CHECK(a.data && reduced[fid].data); const bst_float mval = a.data[0].value; @@ -173,6 +305,7 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { const float mval = 1e-5f; cuts->min_vals_.HostVector()[fid] = mval; } + AddCutPoint(a, max_num_bins, cuts); // push a value that is greater than anything const bst_float cpt diff --git a/src/common/quantile.h b/src/common/quantile.h index 11e2530f748e..a70bf809ea28 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -166,6 +166,16 @@ struct WQSummary { * \param src source sketch */ inline void CopyFrom(const WQSummary &src) { + if (!src.data) { + CHECK_EQ(src.size, 0); + size = 0; + return; + } + if (!data) { + CHECK_EQ(this->size, 0); + CHECK_EQ(src.size, 0); + return; + } size = src.size; std::memcpy(data, src.data, sizeof(Entry) * size); } @@ -721,6 +731,14 @@ class HostSketchContainer { return use_group_ind; } + static std::vector CalcColumnSize(SparsePage const &page, + bst_feature_t const n_columns, + size_t const nthreads); + + static std::vector LoadBalance(SparsePage const &page, + bst_feature_t n_columns, + size_t const nthreads); + static uint32_t SearchGroupIndFromRow(std::vector const &group_ptr, size_t const base_rowid) { CHECK_LT(base_rowid, group_ptr.back()) @@ -730,6 +748,14 @@ class HostSketchContainer { group_ptr.cbegin() - 1; return group_ind; } + // Gather sketches from all workers. + void GatherSketchInfo(std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches); + // Merge sketches from all workers. + void AllReduce(std::vector *p_reduced, + std::vector* p_num_cuts); /* \brief Push a CSR matrix. 
*/ void PushRowPage(SparsePage const& page, MetaInfo const& info); diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index f4c2722fe92c..664118780cc3 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -23,9 +23,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { std::shared_ptr *dmat = static_cast *>(handle); xgboost::MetaInfo &info = (*dmat)->Info(); - ASSERT_EQ(info.num_col_, 2); - ASSERT_EQ(info.num_row_, 3); - ASSERT_EQ(info.num_nonzero_, 6); + ASSERT_EQ(info.num_col_, 2ul); + ASSERT_EQ(info.num_row_, 3ul); + ASSERT_EQ(info.num_nonzero_, 6ul); for (const auto &batch : (*dmat)->GetBatches()) { ASSERT_EQ(batch[0][0].fvalue, 0.0f); @@ -38,9 +38,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { } TEST(CAPI, XGDMatrixCreateFromMatOmp) { - std::vector num_rows = {100, 11374, 15000}; + std::vector num_rows = {100, 11374, 15000}; for (auto row : num_rows) { - int num_cols = 50; + bst_ulong num_cols = 50; int num_missing = 5; DMatrixHandle handle; std::vector data(num_cols * row, 1.5); diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 0fad360f4298..24c23b3e2608 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -159,10 +159,10 @@ TEST(CutsBuilder, SearchGroupInd) { HistogramCuts hmat; size_t group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 0); - ASSERT_EQ(group_ind, 0); + ASSERT_EQ(group_ind, 0ul); group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 5); - ASSERT_EQ(group_ind, 2); + ASSERT_EQ(group_ind, 2ul); EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); @@ -189,7 +189,7 @@ TEST(HistUtil, DenseCutsCategorical) { EXPECT_LT(cuts.MinValues()[0], x_sorted.front()); EXPECT_GT(cuts_from_sketch.front(), x_sorted.front()); EXPECT_GE(cuts_from_sketch.back(), x_sorted.back()); - EXPECT_EQ(cuts_from_sketch.size(), num_categories); + EXPECT_EQ(cuts_from_sketch.size(), static_cast(num_categories)); } } } diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index bd88d14ef1f2..d025e5ea60bf 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -162,7 +162,7 @@ inline void ValidateColumn(const HistogramCuts& cuts, int column_idx, // Check all cut points are unique EXPECT_EQ(std::set(cuts_begin, cuts_end).size(), - cuts_end - cuts_begin); + static_cast(cuts_end - cuts_begin)); auto unique = std::set(sorted_column.begin(), sorted_column.end()); if (unique.size() <= num_bins) { @@ -189,7 +189,7 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, // Collect data into columns std::vector> columns(dmat->Info().num_col_); for (auto& batch : dmat->GetBatches()) { - ASSERT_GT(batch.Size(), 0); + ASSERT_GT(batch.Size(), 0ul); for (auto i = 0ull; i < batch.Size(); i++) { for (auto e : batch[i]) { columns[e.index].push_back(e.fvalue); diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 8665420d684a..029beee8d48b 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -222,7 +222,7 @@ TEST(Json, ParseArray) { auto json = Json::Load(StringView{str.c_str(), str.size()}); json = json["nodes"]; std::vector arr = get(json); - ASSERT_EQ(arr.size(), 3); + ASSERT_EQ(arr.size(), 3ul); Json v0 = arr[0]; ASSERT_EQ(get(v0["depth"]), 3); ASSERT_NEAR(get(v0["gain"]), 10.4866, kRtEps); @@ -284,7 +284,7 @@ TEST(Json, EmptyArray) { std::istringstream iss(str); auto json = 
Json::Load(StringView{str.c_str(), str.size()}); auto arr = get(json["leaf_vector"]); - ASSERT_EQ(arr.size(), 0); + ASSERT_EQ(arr.size(), 0ul); } TEST(Json, Boolean) { @@ -315,7 +315,7 @@ TEST(Json, AssigningObjects) { Json json; json = JsonObject(); json["Okay"] = JsonArray(); - ASSERT_EQ(get(json["Okay"]).size(), 0); + ASSERT_EQ(get(json["Okay"]).size(), 0ul); } { diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index c273658e54cb..fa748de1cc6c 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -5,14 +5,122 @@ namespace xgboost { namespace common { + +TEST(Quantile, LoadBalance) { + size_t constexpr kRows = 1000, kCols = 100; + auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + std::vector cols_ptr; + for (auto const &page : m->GetBatches()) { + cols_ptr = HostSketchContainer::LoadBalance(page, kCols, 13); + } + size_t n_cols = 0; + for (size_t i = 1; i < cols_ptr.size(); ++i) { + n_cols += cols_ptr[i] - cols_ptr[i - 1]; + } + CHECK_EQ(n_cols, kCols); +} + +void TestDistributedQuantile(size_t rows, size_t cols) { + std::string msg {"Skipping AllReduce test"}; + int32_t constexpr kWorkers = 4; + InitRabitContext(msg, kWorkers); + auto world = rabit::GetWorldSize(); + if (world != 1) { + ASSERT_EQ(world, kWorkers); + } else { + return; + } + + std::vector infos(2); + auto& h_weights = infos.front().weights_.HostVector(); + h_weights.resize(rows); + SimpleLCG lcg; + SimpleRealUniformDistribution dist(3, 1000); + std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); }); + std::vector column_size(cols, rows); + size_t n_bins = 64; + + // Generate cuts for distributed environment. + auto sparsity = 0.5f; + auto rank = rabit::GetRank(); + HostSketchContainer sketch_distributed(column_size, n_bins, false); + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_distributed.PushRowPage(page, m->Info()); + } + HistogramCuts distributed_cuts; + sketch_distributed.MakeCuts(&distributed_cuts); + + // Generate cuts for single node environment + rabit::Finalize(); + CHECK_EQ(rabit::GetWorldSize(), 1); + std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); + HostSketchContainer sketch_on_single_node(column_size, n_bins, false); + for (auto rank = 0; rank < world; ++rank) { + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_on_single_node.PushRowPage(page, m->Info()); + } + } + + HistogramCuts single_node_cuts; + sketch_on_single_node.MakeCuts(&single_node_cuts); + + auto const& sptrs = single_node_cuts.Ptrs(); + auto const& dptrs = distributed_cuts.Ptrs(); + auto const& svals = single_node_cuts.Values(); + auto const& dvals = distributed_cuts.Values(); + auto const& smins = single_node_cuts.MinValues(); + auto const& dmins = distributed_cuts.MinValues(); + + ASSERT_EQ(sptrs.size(), dptrs.size()); + for (size_t i = 0; i < sptrs.size(); ++i) { + ASSERT_EQ(sptrs[i], dptrs[i]); + } + + ASSERT_EQ(svals.size(), dvals.size()); + for (size_t i = 0; i < svals.size(); ++i) { + ASSERT_NEAR(svals[i], dvals[i], 2e-2f); + } + + ASSERT_EQ(smins.size(), dmins.size()); + for (size_t i = 0; i < smins.size(); ++i) { + ASSERT_FLOAT_EQ(smins[i], dmins[i]); + } +} + +TEST(Quantile, DistributedBasic) { +#if defined(__unix__) + constexpr size_t 
kRows = 10, kCols = 10; + TestDistributedQuantile(kRows, kCols); +#endif +} + +TEST(Quantile, Distributed) { +#if defined(__unix__) + constexpr size_t kRows = 1000, kCols = 200; + TestDistributedQuantile(kRows, kCols); +#endif +} + TEST(Quantile, SameOnAllWorkers) { +#if defined(__unix__) std::string msg{"Skipping Quantile AllreduceBasic test"}; - size_t constexpr kWorkers = 4; + int32_t constexpr kWorkers = 4; InitRabitContext(msg, kWorkers); auto world = rabit::GetWorldSize(); if (world != 1) { CHECK_EQ(world, kWorkers); } else { + LOG(WARNING) << msg; return; } @@ -72,6 +180,8 @@ TEST(Quantile, SameOnAllWorkers) { } } }); + rabit::Finalize(); +#endif // defined(__unix__) } } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 7dea0b17deb3..e91f19ef84a8 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -7,7 +7,7 @@ namespace xgboost { namespace common { -inline void InitRabitContext(std::string msg, size_t n_workers) { +inline void InitRabitContext(std::string msg, int32_t n_workers) { auto port = std::getenv("DMLC_TRACKER_PORT"); std::string port_str; if (port) { @@ -35,7 +35,7 @@ template void RunWithSeedsAndBins(size_t rows, Fn fn) { for (size_t i = 0; i < bins.size() - 1; ++i) { bins[i] = i * 35 + 2; } - bins.back() = rows + 80; // provide a bin number greater than rows. + bins.back() = rows + 160; // provide a bin number greater than rows. std::vector infos(2); auto& h_weights = infos.front().weights_.HostVector(); diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index dc5c155e6027..145fa0b524cd 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -501,17 +501,20 @@ def run_updater_test(self, client, params, num_rounds, dataset, num_boost_round=num_rounds, evals=[(m, 'train')])['history'] note(history) - assert tm.non_increasing(history['train'][dataset.metric]) + history = history['train'][dataset.metric] + assert tm.non_increasing(history) + # Make sure that it's decreasing + assert history[-1] < history[0] @given(params=hist_parameter_strategy, - num_rounds=strategies.integers(10, 20), + num_rounds=strategies.integers(20, 30), dataset=tm.dataset_strategy) @settings(deadline=None) def test_hist(self, params, num_rounds, dataset, client): self.run_updater_test(client, params, num_rounds, dataset, 'hist') @given(params=exact_parameter_strategy, - num_rounds=strategies.integers(10, 20), + num_rounds=strategies.integers(20, 30), dataset=tm.dataset_strategy) @settings(deadline=None) def test_approx(self, client, params, num_rounds, dataset): @@ -524,8 +527,7 @@ def run_quantile(self, name): exe = None for possible_path in {'./testxgboost', './build/testxgboost', '../build/testxgboost', - '../cpu-build/testxgboost', - '../gpu-build/testxgboost'}: + '../cpu-build/testxgboost'}: if os.path.exists(possible_path): exe = possible_path if exe is None: @@ -542,7 +544,7 @@ def runit(worker_addr, rabit_args): port = port.split('=') env = os.environ.copy() env[port[0]] = port[1] - return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE) + return subprocess.run([exe, test], env=env, capture_output=True) with LocalCluster(n_workers=4) as cluster: with Client(cluster) as client: @@ -555,6 +557,7 @@ def runit(worker_addr, rabit_args): workers=workers, rabit_args=rabit_args) results = client.gather(futures) + for ret in results: msg = ret.stdout.decode('utf-8') assert msg.find('1 test from Quantile') != -1, msg @@ 
-563,4 +566,14 @@ def runit(worker_addr, rabit_args): @pytest.mark.skipif(**tm.no_dask()) @pytest.mark.gtest def test_quantile_basic(self): + self.run_quantile('DistributedBasic') + + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.gtest + def test_quantile(self): + self.run_quantile('Distributed') + + @pytest.mark.skipif(**tm.no_dask()) + @pytest.mark.gtest + def test_quantile_same_on_all_workers(self): self.run_quantile('SameOnAllWorkers') From 24f2e6c97eb8c9562ec3ea7218fcb9b795315d80 Mon Sep 17 00:00:00 2001 From: ShvetsKS <33296480+ShvetsKS@users.noreply.github.com> Date: Wed, 19 Aug 2020 20:37:03 +0300 Subject: [PATCH 07/15] Optimize DMatrix build time. (#5877) Co-authored-by: SHVETS, KIRILL --- python-package/xgboost/core.py | 2 +- src/common/group_data.h | 11 ++-- src/data/data.cc | 93 ++++++++++++++++++++++------------ 3 files changed, 68 insertions(+), 38 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index f2cd880ba074..2a69fea517a7 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -422,7 +422,7 @@ def __init__(self, data, label=None, weight=None, base_margin=None, raise TypeError('Input data can not be a list.') self.missing = missing if missing is not None else np.nan - self.nthread = nthread if nthread is not None else 1 + self.nthread = nthread if nthread is not None else -1 self.silent = silent # force into void_p, mac need to pass things in as void_p diff --git a/src/common/group_data.h b/src/common/group_data.h index 0144d8099926..476b4925bff3 100644 --- a/src/common/group_data.h +++ b/src/common/group_data.h @@ -17,6 +17,7 @@ #include #include #include +#include #include "xgboost/base.h" @@ -56,10 +57,10 @@ class ParallelGroupBuilder { void InitBudget(std::size_t max_key, int nthread) { thread_rptr_.resize(nthread); for (std::size_t i = 0; i < thread_rptr_.size(); ++i) { - thread_rptr_[i].resize(max_key - std::min(base_row_offset_, max_key)); - std::fill(thread_rptr_[i].begin(), thread_rptr_[i].end(), 0); + thread_rptr_[i].resize(max_key - std::min(base_row_offset_, max_key), 0); } } + /*! * \brief step 2: add budget to each key * \param key the key @@ -74,6 +75,7 @@ class ParallelGroupBuilder { } trptr[offset_key] += nelem; } + /*! \brief step 3: initialize the necessary storage */ inline void InitStorage() { // set rptr to correct size @@ -101,6 +103,7 @@ class ParallelGroupBuilder { } data_.resize(rptr_.back()); } + /*! * \brief step 4: add data to the allocated space, * the calls to this function should be exactly match previous call to AddBudget @@ -109,10 +112,10 @@ class ParallelGroupBuilder { * \param value The value to be pushed to the group. 
* \param threadid the id of thread that calls this function */ - void Push(std::size_t key, ValueType value, int threadid) { + void Push(std::size_t key, ValueType&& value, int threadid) { size_t offset_key = key - base_row_offset_; SizeType &rp = thread_rptr_[threadid][offset_key]; - data_[rp++] = value; + data_[rp++] = std::move(value); } private: diff --git a/src/data/data.cc b/src/data/data.cc index 8bd7c76cf59f..d7d18f189642 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -840,10 +840,11 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread // Set number of threads but keep old value so we can reset it after const int nthreadmax = omp_get_max_threads(); if (nthread <= 0) nthread = nthreadmax; - int nthread_original = omp_get_max_threads(); + const int nthread_original = omp_get_max_threads(); omp_set_num_threads(nthread); auto& offset_vec = offset.HostVector(); auto& data_vec = data.HostVector(); + size_t builder_base_row_offset = this->Size(); common::ParallelGroupBuilder< Entry, std::remove_reference::type::value_type> @@ -858,48 +859,74 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid; } } - builder.InitBudget(expected_rows, nthread); + size_t batch_size = batch.Size(); + const size_t thread_size = batch_size/nthread; + builder.InitBudget(expected_rows+1, nthread); uint64_t max_columns = 0; - + if (batch_size == 0) { + omp_set_num_threads(nthread_original); + return max_columns; + } + std::vector> max_columns_vector(nthread); + dmlc::OMPException exec; // First-pass over the batch counting valid elements - size_t batch_size = batch.Size(); -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - data::COOTuple element = line.GetElement(j); - max_columns = - std::max(max_columns, static_cast(element.column_idx + 1)); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - base_rowid; - // Adapter row index is absolute, here we want it relative to - // current page - CHECK_GE(key, builder_base_row_offset); - builder.AddBudget(key, tid); +#pragma omp parallel num_threads(nthread) + { + exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? 
(tid+1)*thread_size : batch_size; + max_columns_vector[tid].resize(1, 0); + uint64_t& max_columns_local = max_columns_vector[tid][0]; + + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = element.row_idx - base_rowid; + CHECK_GE(key, builder_base_row_offset); + max_columns_local = + std::max(max_columns_local, static_cast(element.column_idx + 1)); + + if (!common::CheckNAN(element.value) && element.value != missing) { + // Adapter row index is absolute, here we want it relative to + // current page + builder.AddBudget(key, tid); + } + } } - } + }); } + exec.Rethrow(); + for (const auto & max : max_columns_vector) { + max_columns = std::max(max_columns, max[0]); + } + builder.InitStorage(); // Second pass over batch, placing elements in correct position -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - auto element = line.GetElement(j); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - - base_rowid; // Adapter row index is absolute, here we want - // it relative to current page - builder.Push(key, Entry(element.column_idx, element.value), tid); + +#pragma omp parallel num_threads(nthread) + { + exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = (element.row_idx - base_rowid); + if (!common::CheckNAN(element.value) && element.value != missing) { + builder.Push(key, Entry(element.column_idx, element.value), tid); + } + } } - } + }); } + exec.Rethrow(); omp_set_num_threads(nthread_original); + return max_columns; } From 1fd29edf669ccc77144f94bd89e937a4319c9710 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Wed, 19 Aug 2020 12:33:51 -0700 Subject: [PATCH 08/15] [CI] Migrate linters to GitHub Actions (#6035) * [CI] Move lint to GitHub Actions * [CI] Move Doxygen to GitHub Actions * [CI] Move Sphinx build test to GitHub Actions * [CI] Reduce workload for Windows R tests * [CI] Move clang-tidy to Build stage --- .github/workflows/main.yml | 95 +++++++++++++++++++++++++++++++++----- Jenkinsfile | 58 +---------------------- doc/conf.py | 4 +- tests/ci_build/doxygen.sh | 19 -------- 4 files changed, 88 insertions(+), 88 deletions(-) delete mode 100755 tests/ci_build/doxygen.sh diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9d4196feb754..6a77114277bd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -40,12 +40,92 @@ jobs: cd jvm-packages mvn test -pl :xgboost4j_2.12 + lint: + runs-on: ubuntu-latest + name: Code linting for Python and C++ + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + architecture: 'x64' + - name: Install Python packages + run: | + python -m pip install wheel setuptools + python -m pip install pylint cpplint numpy scipy scikit-learn + - name: Run lint + run: | + make lint + + doxygen: + runs-on: ubuntu-latest + name: Generate C/C++ API doc using Doxygen + steps: + - uses: actions/checkout@v2 + with: + 
submodules: 'true' + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + architecture: 'x64' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build + python -m pip install wheel setuptools + python -m pip install awscli + - name: Run Doxygen + run: | + mkdir build + cd build + cmake .. -DBUILD_C_DOC=ON -GNinja + ninja -v doc_doxygen + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + - name: Publish + run: | + cd build/ + tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/ + python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/ --acl public-read + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }} + + sphinx: + runs-on: ubuntu-latest + name: Build docs using Sphinx + steps: + - uses: actions/checkout@v2 + with: + submodules: 'true' + - uses: actions/setup-python@v2 + with: + python-version: '3.7' + architecture: 'x64' + - name: Install system packages + run: | + sudo apt-get install -y --no-install-recommends graphviz + python -m pip install wheel setuptools + python -m pip install -r doc/requirements.txt + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_') + - name: Run Sphinx + run: | + make -C doc html + env: + SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }} lintr: runs-on: ${{ matrix.config.os }} - name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: matrix: config: @@ -83,23 +163,16 @@ jobs: R.exe CMD INSTALL . Rscript.exe tests/run_lint.R - test-with-R: runs-on: ${{ matrix.config.os }} - name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }} - strategy: fail-fast: false matrix: config: - - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'autotools'} - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'autotools'} - - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'} - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'} - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'} - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'} - - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'cmake'} - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'} env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true @@ -130,8 +203,8 @@ jobs: - uses: actions/setup-python@v2 with: - python-version: '3.6' # Version range or exact version of a Python version to use, using SemVer's version range syntax - architecture: 'x64' # optional x64 or x86. 
Defaults to x64 if not specified + python-version: '3.7' + architecture: 'x64' - name: Test R run: | diff --git a/Jenkinsfile b/Jenkinsfile index 60e8116f330b..a251ac1a3fb2 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -49,24 +49,12 @@ pipeline { stash name: 'srcs' } } - stage('Jenkins Linux: Formatting Check') { - agent none - steps { - script { - parallel ([ - 'clang-tidy': { ClangTidy() }, - 'lint': { Lint() }, - 'sphinx-doc': { SphinxDoc() }, - 'doxygen': { Doxygen() } - ]) - } - } - } stage('Jenkins Linux: Build') { agent none steps { script { parallel ([ + 'clang-tidy': { ClangTidy() }, 'build-cpu': { BuildCPU() }, 'build-cpu-rabit-mock': { BuildCPUMock() }, 'build-cpu-non-omp': { BuildCPUNonOmp() }, @@ -152,50 +140,6 @@ def ClangTidy() { } } -def Lint() { - node('linux && cpu') { - unstash name: 'srcs' - echo "Running lint..." - def container_type = "cpu" - def docker_binary = "docker" - sh """ - ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make lint" - """ - deleteDir() - } -} - -def SphinxDoc() { - node('linux && cpu') { - unstash name: 'srcs' - echo "Running sphinx-doc..." - def container_type = "cpu" - def docker_binary = "docker" - def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'" - sh """#!/bin/bash - ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make -C doc html" - """ - deleteDir() - } -} - -def Doxygen() { - node('linux && cpu') { - unstash name: 'srcs' - echo "Running doxygen..." - def container_type = "cpu" - def docker_binary = "docker" - sh """ - ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME} - """ - if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { - echo 'Uploading doc...' - s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2" - } - deleteDir() - } -} - def BuildCPU() { node('linux && cpu') { unstash name: 'srcs' diff --git a/doc/conf.py b/doc/conf.py index d17f9594a285..749d400c6e8f 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -22,7 +22,7 @@ import guzzle_sphinx_theme git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None) -if git_branch is None: +if not git_branch: # If SPHINX_GIT_BRANCH environment variable is not given, run git # to determine branch name git_branch = [ @@ -30,6 +30,8 @@ git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n') ] git_branch = [x for x in git_branch if 'HEAD' not in x] +else: + git_branch = [git_branch] print('git_branch = {}'.format(git_branch[0])) try: filename, _ = urllib.request.urlretrieve( diff --git a/tests/ci_build/doxygen.sh b/tests/ci_build/doxygen.sh deleted file mode 100755 index 41757eb6935f..000000000000 --- a/tests/ci_build/doxygen.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "Usage: $0 [branch name]" - exit 1 -fi - -set -e -set -x - -branch_name=$1 - -rm -rf build -mkdir build -cd build -cmake .. -DBUILD_C_DOC=ON -make -j - -tar cvjf ${branch_name}.tar.bz2 doc_doxygen/ From 7be2e04bd43ec1ed835e8fe0e3db476c017da7a9 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 21 Aug 2020 10:23:06 +0800 Subject: [PATCH 09/15] Fix scikit learn cls doc. 
(#6041) --- python-package/xgboost/sklearn.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index c6c34dce1c99..4721533b49d5 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -757,7 +757,10 @@ def intercept_(self): @xgboost_model_doc( "Implementation of the scikit-learn API for XGBoost classification.", - ['model', 'objective']) + ['model', 'objective'], extra_parameters=''' + n_estimators : int + Number of boosting rounds. +''') class XGBClassifier(XGBModel, XGBClassifierBase): # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes def __init__(self, objective="binary:logistic", **kwargs): @@ -1041,7 +1044,10 @@ def __init__(self, objective="reg:squarederror", **kwargs): @xgboost_model_doc( "scikit-learn API for XGBoost random forest regression.", - ['model', 'objective']) + ['model', 'objective'], extra_parameters=''' + n_estimators : int + Number of trees in random forest to fit. +''') class XGBRFRegressor(XGBRegressor): # pylint: disable=missing-docstring def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8, From 7a46515d3d32380814079146e5a87375d0763b97 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 21 Aug 2020 10:39:46 +0800 Subject: [PATCH 10/15] Remove win2016 jvm github action test. (#6042) --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 6a77114277bd..5c73ffa27aa9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -17,7 +17,7 @@ jobs: strategy: fail-fast: false matrix: - os: [windows-latest, windows-2016, ubuntu-latest] + os: [windows-latest, ubuntu-latest] steps: - uses: actions/checkout@v2 From b9ebbffc57e238a9487bbd14a806fad7d807c3ae Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Aug 2020 13:18:48 +0800 Subject: [PATCH 11/15] Fix plotting test. (#6040) Previously the test loads a model generated by `test_basic.py`, now we generate the model explicitly. * Cleanup saved files for basic tests. 
--- tests/python/test_basic.py | 23 +++++++++++++---------- tests/python/test_basic_models.py | 23 ++++++++++++++--------- tests/python/test_plotting.py | 24 ++++++++++++------------ 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index dad7ddc9db0c..acfb6db560be 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -110,16 +110,19 @@ def test_multiclass(self): # error must be smaller than 10% assert err < 0.1 - # save dmatrix into binary buffer - dtest.save_binary('dtest.buffer') - # save model - bst.save_model('xgb.model') - # load model and data in - bst2 = xgb.Booster(model_file='xgb.model') - dtest2 = xgb.DMatrix('dtest.buffer') - preds2 = bst2.predict(dtest2) - # assert they are the same - assert np.sum(np.abs(preds2 - preds)) == 0 + with tempfile.TemporaryDirectory() as tmpdir: + dtest_path = os.path.join(tmpdir, 'dtest.buffer') + model_path = os.path.join(tmpdir, 'xgb.model') + # save dmatrix into binary buffer + dtest.save_binary(dtest_path) + # save model + bst.save_model(model_path) + # load model and data in + bst2 = xgb.Booster(model_file=model_path) + dtest2 = xgb.DMatrix(dtest_path) + preds2 = bst2.predict(dtest2) + # assert they are the same + assert np.sum(np.abs(preds2 - preds)) == 0 def test_dump(self): data = np.random.randn(100, 2) diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py index 3eafdf71d821..529f7784c60d 100644 --- a/tests/python/test_basic_models.py +++ b/tests/python/test_basic_models.py @@ -6,6 +6,7 @@ import testing as tm import pytest import locale +import tempfile dpath = 'demo/data/' dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') @@ -60,15 +61,20 @@ def test_dart(self): # error must be smaller than 10% assert err < 0.1 - # save dmatrix into binary buffer - dtest.save_binary('dtest.buffer') - model_path = 'xgb.model.dart' - # save model - bst.save_model(model_path) - # load model and data in - bst2 = xgb.Booster(params=param, model_file='xgb.model.dart') - dtest2 = xgb.DMatrix('dtest.buffer') + with tempfile.TemporaryDirectory() as tmpdir: + dtest_path = os.path.join(tmpdir, 'dtest.dmatrix') + model_path = os.path.join(tmpdir, 'xgboost.model.dart') + # save dmatrix into binary buffer + dtest.save_binary(dtest_path) + model_path = model_path + # save model + bst.save_model(model_path) + # load model and data in + bst2 = xgb.Booster(params=param, model_file=model_path) + dtest2 = xgb.DMatrix(dtest_path) + preds2 = bst2.predict(dtest2, ntree_limit=num_round) + # assert they are the same assert np.sum(np.abs(preds2 - preds)) == 0 @@ -103,7 +109,6 @@ def my_logloss(preds, dtrain): for ii in range(len(preds_list)): for jj in range(ii + 1, len(preds_list)): assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0 - os.remove(model_path) def run_eta_decay(self, tree_method): watchlist = [(dtest, 'eval'), (dtrain, 'train')] diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py index 18b0b83c7d60..e5e3a96e1bb0 100644 --- a/tests/python/test_plotting.py +++ b/tests/python/test_plotting.py @@ -14,27 +14,27 @@ except ImportError: pass +pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), + tm.no_graphviz())) -pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), tm.no_graphviz())) - - -dpath = 'demo/data/' -rng = np.random.RandomState(1994) +dpath = 'demo/data/agaricus.txt.train' class TestPlotting(unittest.TestCase): - def test_plotting(self): - bst2 = 
xgb.Booster(model_file='xgb.model') + m = xgb.DMatrix(dpath) + booster = xgb.train({'max_depth': 2, 'eta': 1, + 'objective': 'binary:logistic'}, m, + num_boost_round=2) - ax = xgb.plot_importance(bst2) + ax = xgb.plot_importance(booster) assert isinstance(ax, Axes) assert ax.get_title() == 'Feature importance' assert ax.get_xlabel() == 'F score' assert ax.get_ylabel() == 'Features' assert len(ax.patches) == 4 - ax = xgb.plot_importance(bst2, color='r', + ax = xgb.plot_importance(booster, color='r', title='t', xlabel='x', ylabel='y') assert isinstance(ax, Axes) assert ax.get_title() == 't' @@ -44,7 +44,7 @@ def test_plotting(self): for p in ax.patches: assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red - ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'], + ax = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'], title=None, xlabel=None, ylabel=None) assert isinstance(ax, Axes) assert ax.get_title() == '' @@ -56,10 +56,10 @@ def test_plotting(self): assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue - g = xgb.to_graphviz(bst2, num_trees=0) + g = xgb.to_graphviz(booster, num_trees=0) assert isinstance(g, Source) - ax = xgb.plot_tree(bst2, num_trees=0) + ax = xgb.plot_tree(booster, num_trees=0) assert isinstance(ax, Axes) def test_importance_plot_lim(self): From a144daf0340357bc8fc39628451319e215b1b905 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 22 Aug 2020 19:34:52 +0800 Subject: [PATCH 12/15] Limit tree depth for GPU hist. (#6045) --- python-package/xgboost/core.py | 2 +- python-package/xgboost/sklearn.py | 4 ++-- src/tree/param.h | 4 ++++ tests/cpp/tree/test_gpu_hist.cu | 12 ++++++++++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2a69fea517a7..c8d0460825e5 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -40,7 +40,7 @@ class EarlyStopException(Exception): """ def __init__(self, best_iteration): - super(EarlyStopException, self).__init__() + super().__init__() self.best_iteration = best_iteration diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 4721533b49d5..96d358128a9a 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -1025,7 +1025,7 @@ def __init__(self, **kwargs) def get_xgb_params(self): - params = super(XGBRFClassifier, self).get_xgb_params() + params = super().get_xgb_params() params['num_parallel_tree'] = self.n_estimators return params @@ -1057,7 +1057,7 @@ def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8, reg_lambda=reg_lambda, **kwargs) def get_xgb_params(self): - params = super(XGBRFRegressor, self).get_xgb_params() + params = super().get_xgb_params() params['num_parallel_tree'] = self.n_estimators return params diff --git a/src/tree/param.h b/src/tree/param.h index 280f06066e44..dedc2a7f0ff5 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -239,6 +239,10 @@ struct TrainParam : public XGBoostParameter { if (this->max_leaves > 0) { n_nodes = this->max_leaves * 2 - 1; } else { + // bst_node_t will overflow. 
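+      // A full binary tree of depth d has 2^(d+1) - 1 nodes, so any depth
+      // beyond 31 no longer fits in the 32-bit node index.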
+ CHECK_LE(this->max_depth, 31) + << "max_depth can not be greater than 31 as that might generate 2 ** " + "32 - 1 nodes."; n_nodes = (1 << (this->max_depth + 1)) - 1; } CHECK_NE(n_nodes, 0); diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index 153cafb88fd8..5199a27d26e8 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -505,5 +505,17 @@ TEST(GpuHist, ConfigIO) { ASSERT_EQ(j_updater, j_updater_roundtrip); } +TEST(GpuHist, MaxDepth) { + GenericParameter generic_param(CreateEmptyGenericParam(0)); + size_t constexpr kRows = 16; + size_t constexpr kCols = 4; + auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + + auto learner = std::unique_ptr(Learner::Create({p_mat})); + learner->SetParam("max_depth", "32"); + learner->Configure(); + + ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error); +} } // namespace tree } // namespace xgboost From cfced58c1c3614cdf1431960e11ac45785b0bf75 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sat, 22 Aug 2020 23:24:46 -0700 Subject: [PATCH 13/15] [CI] Port CI fixes from the 1.2.0 branch (#6050) * Fix a unit test on CLI, to handle RC versions * [CI] Use mgpu machine to run gpu hist unit tests * [CI] Build GPU-enabled JAR artifact and deploy to xgboost-maven-repo --- Jenkinsfile | 9 +++++---- tests/ci_build/deploy_jvm_packages.sh | 18 ++++++++++++++---- tests/python/test_cli.py | 2 ++ 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index a251ac1a3fb2..54c8b9565ec8 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -245,7 +245,7 @@ def BuildCUDA(args) { } def BuildJVMPackagesWithCUDA(args) { - node('linux && gpu') { + node('linux && mgpu') { unstash name: 'srcs' echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}" def container_type = "jvm_gpu_build" @@ -440,10 +440,11 @@ def DeployJVMPackages(args) { unstash name: 'srcs' if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) { echo 'Deploying to xgboost-maven-repo S3 repo...' - def container_type = "jvm" - def docker_binary = "docker" sh """ - ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} + ${dockerRun} jvm docker tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 0 + """ + sh """ + ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION=10.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 1 """ } deleteDir() diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh index 950cfeb38573..50a190862dd1 100755 --- a/tests/ci_build/deploy_jvm_packages.sh +++ b/tests/ci_build/deploy_jvm_packages.sh @@ -3,22 +3,32 @@ set -e set -x -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" +if [ $# -ne 2 ]; then + echo "Usage: $0 [spark version] [build_gpu? 0 or 1]" exit 1 fi spark_version=$1 +build_gpu=$2 # Initialize local Maven repository ./tests/ci_build/initialize_maven.sh -rm -rf build/ cd jvm-packages +rm -rf $(find . 
-name target) +rm -rf ../build/ # Re-build package without Mock Rabit # Deploy to S3 bucket xgboost-maven-repo -mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +if [[ "$build_gpu" == "0" ]] +then + # Build CPU artifact + mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +else + # Build GPU artifact + sed -i -e 's/xgboost\(.*\)_\(.*\)<\/artifactId>/xgboost\1-gpu_\2<\/artifactId>/' $(find . -name pom.xml) + mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +fi set +x set +e diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py index 4efb87e21938..e437f426cc6f 100644 --- a/tests/python/test_cli.py +++ b/tests/python/test_cli.py @@ -124,6 +124,8 @@ def test_cli_help(self): v = xgboost.__version__ if v.find('SNAPSHOT') != -1: assert msg.split(':')[1].strip() == v.split('-')[0] + elif v.find('rc') != -1: + assert msg.split(':')[1].strip() == v.split('rc')[0] else: assert msg.split(':')[1].strip() == v From 4729458a363c64291e84da28b408a0ac8d7851fa Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 23 Aug 2020 14:14:53 -0700 Subject: [PATCH 14/15] [jvm-packages] [doc] Update install doc for JVM packages (#6051) --- doc/jvm/index.rst | 10 ++++-- jvm-packages/README.md | 74 +++++++++++++++++++++++++----------------- 2 files changed, 53 insertions(+), 31 deletions(-) diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst index a4c9cdd53abe..6b3bf9348c12 100644 --- a/doc/jvm/index.rst +++ b/doc/jvm/index.rst @@ -65,6 +65,8 @@ This will check out the latest stable version from the Maven Central. For the latest release version number, please check `here `_. +To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix). + .. note:: Using Maven repository hosted by the XGBoost project There may be some delay until a new release becomes available to Maven Central. If you would like to access the latest release immediately, add the Maven repository hosted by the XGBoost project: @@ -83,6 +85,11 @@ For the latest release version number, please check `here `_. Installation from source ======================== diff --git a/jvm-packages/README.md b/jvm-packages/README.md index 5bc65e343cc0..7185e951c0ca 100644 --- a/jvm-packages/README.md +++ b/jvm-packages/README.md @@ -18,11 +18,11 @@ You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.o ## Add Maven Dependency -XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5 +XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5. ### Access release version -maven +Maven ``` @@ -30,66 +30,82 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5 xgboost4j_2.12 latest_version_num -``` - -sbt + + ml.dmlc + xgboost4j-spark_2.12 + latest_version_num + +``` + +sbt ```sbt - "ml.dmlc" %% "xgboost4j" % "latest_version_num" -``` +libraryDependencies ++= Seq( + "ml.dmlc" %% "xgboost4j" % "latest_version_num", + "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num" +) +``` For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases). 
-if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark` +To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. ### Access SNAPSHOT version -You need to add github as repo: +First add the following Maven repository hosted by the XGBoost project: -maven: +Maven: ```xml - GitHub Repo - GitHub Repo - https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/ + XGBoost4J Snapshot Repo + XGBoost4J Snapshot Repo + https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/ ``` sbt: - -```sbt -resolvers += "GitHub Repo" at "https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/" + +```sbt +resolvers += "XGBoost4J Snapshot Repo" at "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/" ``` -the add dependency as following: +Then add XGBoost4J as a dependency: -maven +Maven ``` ml.dmlc xgboost4j_2.12 - latest_version_num + latest_version_num-SNAPSHOT + + + ml.dmlc + xgboost4j-spark_2.12 + latest_version_num-SNAPSHOT -``` - -sbt +``` + +sbt ```sbt - "ml.dmlc" %% "xgboost4j" % "latest_version_num" -``` +libraryDependencies ++= Seq( + "ml.dmlc" %% "xgboost4j" % "latest_version_num-SNAPSHOT", + "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num-SNAPSHOT" +) +``` -For the latest release version number, please check [here](https://github.com/CodingCat/xgboost/tree/maven-repo/ml/dmlc/xgboost4j_2.12). +For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html). -if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark` +To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead. ## Examples Full code examples for Scala, Java, Apache Spark, and Apache Flink can be found in the [examples package](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example). -**NOTE on LIBSVM Format**: +**NOTE on LIBSVM Format**: -There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. +There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost. When users use Spark to load trainingset/testset in LibSVM format with the following code snippet: @@ -108,7 +124,7 @@ You can build/package xgboost4j locally with the following steps: 2. Clone this repo: `git clone --recursive https://github.com/dmlc/xgboost.git` 3. Run the following command: - With Tests: `./xgboost/jvm-packages/dev/build-linux.sh` - - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests` + - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests` **Windows:** 1. Ensure [Docker for Windows](https://docs.docker.com/docker-for-windows/install/) is installed. 
From b3193052b31e3e984f0f3d9c2c67ae3c2e114f9b Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sun, 23 Aug 2020 17:13:46 -0700 Subject: [PATCH 15/15] Bump version to 1.3.0 snapshot in master (#6052) --- CMakeLists.txt | 2 +- R-package/DESCRIPTION | 2 +- jvm-packages/pom.xml | 2 +- jvm-packages/xgboost4j-example/pom.xml | 8 ++++---- jvm-packages/xgboost4j-flink/pom.xml | 6 +++--- jvm-packages/xgboost4j-spark/pom.xml | 4 ++-- jvm-packages/xgboost4j/pom.xml | 4 ++-- python-package/xgboost/VERSION | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e26265130e07..24b9ac3adcaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,5 @@ cmake_minimum_required(VERSION 3.13) -project(xgboost LANGUAGES CXX C VERSION 1.2.0) +project(xgboost LANGUAGES CXX C VERSION 1.3.0) include(cmake/Utils.cmake) list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules") cmake_policy(SET CMP0022 NEW) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index b5d7585a3ca8..1a35eaa0612a 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -1,7 +1,7 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting -Version: 1.2.0.1 +Version: 1.3.0.1 Date: 2020-02-21 Authors@R: c( person("Tianqi", "Chen", role = c("aut"), diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index fdca78ba403b..03de3bd1c019 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT pom XGBoost JVM Package JVM Package for XGBoost diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml index b70f3e25f2e3..4f493caa444a 100644 --- a/jvm-packages/xgboost4j-example/pom.xml +++ b/jvm-packages/xgboost4j-example/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT xgboost4j-example_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT jar @@ -26,7 +26,7 @@ ml.dmlc xgboost4j-spark_${scala.binary.version} - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT org.apache.spark @@ -37,7 +37,7 @@ ml.dmlc xgboost4j-flink_${scala.binary.version} - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT org.apache.commons diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml index 645822d2ad07..a65823a228a9 100644 --- a/jvm-packages/xgboost4j-flink/pom.xml +++ b/jvm-packages/xgboost4j-flink/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT xgboost4j-flink_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT @@ -26,7 +26,7 @@ ml.dmlc xgboost4j_${scala.binary.version} - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT org.apache.commons diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml index 115f563938ca..6435a17f37d9 100644 --- a/jvm-packages/xgboost4j-spark/pom.xml +++ b/jvm-packages/xgboost4j-spark/pom.xml @@ -6,7 +6,7 @@ ml.dmlc xgboost-jvm_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT xgboost4j-spark_2.12 @@ -24,7 +24,7 @@ ml.dmlc xgboost4j_${scala.binary.version} - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT org.apache.spark diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index 927e6d42e418..fff44d9ea37d 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -6,10 +6,10 @@ ml.dmlc xgboost-jvm_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT xgboost4j_2.12 - 1.2.0-SNAPSHOT + 1.3.0-SNAPSHOT jar diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION index 468e6c357b7a..9d7c109bb7dc 100644 --- a/python-package/xgboost/VERSION +++ 
b/python-package/xgboost/VERSION @@ -1 +1 @@ -1.2.0-SNAPSHOT +1.3.0-SNAPSHOT
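
The `max_depth` cap introduced in patch 12 above can also be exercised from the Python package. The following is a minimal sketch, not part of the patch series: it mirrors the `GpuHist.MaxDepth` C++ test and assumes a CUDA-capable device and that the underlying `dmlc::Error` surfaces as `xgboost.core.XGBoostError` through the Python binding.

```python
import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
X = rng.randn(16, 4)
y = rng.randn(16)
dtrain = xgb.DMatrix(X, y)

# Depths above 31 would overflow the 32-bit node index (a full binary tree of
# depth 31 already has 2 ** 32 - 1 nodes), so training is expected to fail fast.
try:
    xgb.train({'tree_method': 'gpu_hist', 'max_depth': 32}, dtrain,
              num_boost_round=1)
except xgb.core.XGBoostError as err:
    print(err)  # expected to mention the 31-depth limit
```

The cap exists because tree nodes are addressed with 32-bit indices; rejecting the configuration up front is cheaper than failing partway through tree construction.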