diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9d4196feb754..5c73ffa27aa9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, windows-2016, ubuntu-latest]
+        os: [windows-latest, ubuntu-latest]
     steps:
     - uses: actions/checkout@v2
@@ -40,12 +40,92 @@ jobs:
         cd jvm-packages
         mvn test -pl :xgboost4j_2.12
 
+  lint:
+    runs-on: ubuntu-latest
+    name: Code linting for Python and C++
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install Python packages
+      run: |
+        python -m pip install wheel setuptools
+        python -m pip install pylint cpplint numpy scipy scikit-learn
+    - name: Run lint
+      run: |
+        make lint
+
+  doxygen:
+    runs-on: ubuntu-latest
+    name: Generate C/C++ API doc using Doxygen
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install system packages
+      run: |
+        sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build
+        python -m pip install wheel setuptools
+        python -m pip install awscli
+    - name: Run Doxygen
+      run: |
+        mkdir build
+        cd build
+        cmake .. -DBUILD_C_DOC=ON -GNinja
+        ninja -v doc_doxygen
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+    - name: Publish
+      run: |
+        cd build/
+        tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
+        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/ --acl public-read
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
+
+  sphinx:
+    runs-on: ubuntu-latest
+    name: Build docs using Sphinx
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install system packages
+      run: |
+        sudo apt-get install -y --no-install-recommends graphviz
+        python -m pip install wheel setuptools
+        python -m pip install -r doc/requirements.txt
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+    - name: Run Sphinx
+      run: |
+        make -C doc html
+      env:
+        SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
+
   lintr:
     runs-on: ${{ matrix.config.os }}
-    name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
-
     strategy:
       matrix:
         config:
@@ -83,23 +163,16 @@ jobs:
         R.exe CMD INSTALL .
         Rscript.exe tests/run_lint.R
 
   test-with-R:
     runs-on: ${{ matrix.config.os }}
-    name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
-
     strategy:
       fail-fast: false
       matrix:
        config:
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'autotools'}
           - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'}
           - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
-          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
           - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'cmake'}
           - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
@@ -130,8 +203,8 @@ jobs:
     - uses: actions/setup-python@v2
       with:
-        python-version: '3.6' # Version range or exact version of a Python version to use, using SemVer's version range syntax
-        architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified
+        python-version: '3.7'
+        architecture: 'x64'
     - name: Test R
       run: |
diff --git a/.travis.yml b/.travis.yml
index d0f72423b6ea..5f782ffe472a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,38 +1,33 @@
-# disable sudo for container build.
 sudo: required
-# Enabling test OS X
-os:
-  - linux
-  - osx
-
 osx_image: xcode10.1
 dist: bionic
-# Use Build Matrix to do lint and build seperately
 env:
-  matrix:
-    # python package test
-    - TASK=python_test
-    # test installation of Python source distribution
-    - TASK=python_sdist_test
-    # java package test
-    - TASK=java_test
-    # cmake test
-    - TASK=cmake_test
   global:
     - secure: "PR16i9F8QtNwn99C5NDp8nptAS+97xwDtXEJJfEiEVhxPaaRkOp0MPWhogCaK0Eclxk1TqkgWbdXFknwGycX620AzZWa/A1K3gAs+GrpzqhnPMuoBJ0Z9qxXTbSJvCyvMbYwVrjaxc/zWqdMU8waWz8A7iqKGKs/SqbQ3rO6v7c="
     - secure: "dAGAjBokqm/0nVoLMofQni/fWIBcYSmdq4XvCBX1ZAMDsWnuOfz/4XCY6h2lEI1rVHZQ+UdZkc9PioOHGPZh5BnvE49/xVVWr9c4/61lrDOlkD01ZjSAeoV0fAZq+93V/wPl4QV+MM+Sem9hNNzFSbN5VsQLAiWCSapWsLdKzqA="
-matrix:
-  exclude:
+jobs:
+  include:
     - os: linux
+      arch: amd64
+      env: TASK=python_sdist_test
+    - os: osx
+      arch: amd64
       env: TASK=python_test
-    - os: linux
+    - os: osx
+      arch: amd64
+      env: TASK=python_sdist_test
+    - os: osx
+      arch: amd64
       env: TASK=java_test
-    - os: linux
+    - os: osx
+      arch: amd64
      env: TASK=cmake_test
+    - os: linux
+      arch: s390x
+      env: TASK=s390x_test
 
 # dependent brew packages
 addons:
@@ -47,6 +42,9 @@ addons:
       - wget
       - r
     update: true
+  apt:
+    packages:
+      - snapd
 
 before_install:
   - source tests/travis/travis_setup_env.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e26265130e07..24b9ac3adcaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.13)
-project(xgboost LANGUAGES CXX C VERSION 1.2.0)
+project(xgboost LANGUAGES CXX C VERSION 1.3.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
diff --git a/Jenkinsfile b/Jenkinsfile
index 60e8116f330b..54c8b9565ec8 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,24 +49,12 @@ pipeline {
         stash name: 'srcs'
       }
     }
-    stage('Jenkins Linux: Formatting Check') {
-      agent none
-      steps {
-        script {
-          parallel ([
-            'clang-tidy': { ClangTidy() },
-            'lint': { Lint() },
-            'sphinx-doc': { SphinxDoc() },
-            'doxygen': { Doxygen() }
-          ])
-        }
-      }
-    }
     stage('Jenkins Linux: Build') {
       agent none
       steps {
         script {
           parallel ([
+            'clang-tidy': { ClangTidy() },
             'build-cpu': { BuildCPU() },
             'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-cpu-non-omp': { BuildCPUNonOmp() },
@@ -152,50 +140,6 @@ def ClangTidy() {
   }
 }
 
-def Lint() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running lint..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    sh """
-    ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make lint"
-    """
-    deleteDir()
-  }
-}
-
-def SphinxDoc() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running sphinx-doc..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'"
-    sh """#!/bin/bash
-    ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make -C doc html"
-    """
-    deleteDir()
-  }
-}
-
-def Doxygen() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running doxygen..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    sh """
-    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME}
-    """
-    if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
-      echo 'Uploading doc...'
-      s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
-    }
-    deleteDir()
-  }
-}
-
 def BuildCPU() {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -301,7 +245,7 @@ def BuildCUDA(args) {
 }
 
 def BuildJVMPackagesWithCUDA(args) {
-  node('linux && gpu') {
+  node('linux && mgpu') {
     unstash name: 'srcs'
     echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}"
     def container_type = "jvm_gpu_build"
@@ -496,10 +440,11 @@ def DeployJVMPackages(args) {
     unstash name: 'srcs'
     if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
       echo 'Deploying to xgboost-maven-repo S3 repo...'
-      def container_type = "jvm"
-      def docker_binary = "docker"
       sh """
-      ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
+      ${dockerRun} jvm docker tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 0
+      """
+      sh """
+      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION=10.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 1
       """
     }
     deleteDir()
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index b5d7585a3ca8..1a35eaa0612a 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.2.0.1
+Version: 1.3.0.1
 Date: 2020-02-21
 Authors@R: c(
   person("Tianqi", "Chen", role = c("aut"),
diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R
index 3b76e9facf42..339e0fac1600 100644
--- a/R-package/R/xgb.ggplot.R
+++ b/R-package/R/xgb.ggplot.R
@@ -99,6 +99,85 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
   }
 }
 
+#' @rdname xgb.plot.shap.summary
+#' @export
+xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
+                                    trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
+  data_list <- xgb.shap.data(
+    data = data,
+    shap_contrib = shap_contrib,
+    features = features,
+    top_n = top_n,
+    model = model,
+    trees = trees,
+    target_class = target_class,
+    approxcontrib = approxcontrib,
+    subsample = subsample,
+    max_observations = 10000  # 10,000 samples per feature.
+  )
+  p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
+  # Reverse factor levels so that the first level is at the top of the plot
+  p_data[, "feature" := factor(feature, rev(levels(feature)))]
+
+  p <- ggplot2::ggplot(p_data, ggplot2::aes(x = feature, y = shap_value, colour = feature_value)) +
+    ggplot2::geom_jitter(alpha = 0.5, width = 0.1) +
+    ggplot2::scale_colour_viridis_c(limits = c(-3, 3), option = "plasma", direction = -1) +
+    ggplot2::geom_abline(slope = 0, intercept = 0, colour = "darkgrey") +
+    ggplot2::coord_flip()
+
+  p
+}
+
+#' Combine and melt feature values and SHAP contributions for sample
+#' observations.
+#'
+#' Conforms to data format required for ggplot functions.
+#'
+#' Internal utility function.
+#'
+#' @param data_list List containing 'data' and 'shap_contrib' returned by
+#'   \code{xgb.shap.data()}.
+#' @param normalize Whether to standardize feature values to have mean 0 and
+#'   standard deviation 1 (useful for comparing multiple features on the same
+#'   plot). Default \code{FALSE}.
+#'
+#' @return A data.table containing the observation ID, the feature name, the
+#'   feature value (normalized if specified), and the SHAP contribution value.
+prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
+  data <- data_list[["data"]]
+  shap_contrib <- data_list[["shap_contrib"]]
+
+  data <- data.table::as.data.table(as.matrix(data))
+  if (normalize) {
+    data[, (names(data)) := lapply(.SD, normalize)]
+  }
+  data[, "id" := seq_len(nrow(data))]
+  data_m <- data.table::melt.data.table(data, id.vars = "id", variable.name = "feature", value.name = "feature_value")
+
+  shap_contrib <- data.table::as.data.table(as.matrix(shap_contrib))
+  shap_contrib[, "id" := seq_len(nrow(shap_contrib))]
+  shap_contrib_m <- data.table::melt.data.table(shap_contrib, id.vars = "id", variable.name = "feature", value.name = "shap_value")
+
+  p_data <- data.table::merge.data.table(data_m, shap_contrib_m, by = c("id", "feature"))
+
+  p_data
+}
+
+#' Scale feature value to have mean 0, standard deviation 1
+#'
+#' This is used to compare multiple features on the same plot.
+#' Internal utility function
+#'
+#' @param x Numeric vector
+#'
+#' @return Numeric vector with mean 0 and sd 1.
+normalize <- function(x) {
+  loc <- mean(x, na.rm = TRUE)
+  scale <- stats::sd(x, na.rm = TRUE)
+
+  (x - loc) / scale
+}
+
 # Plot multiple ggplot graph aligned by rows and columns.
 # ... the plots
 # cols number of columns
@@ -131,5 +210,5 @@ multiplot <- function(..., cols = 1) {
 
 globalVariables(c(
   "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
-  "element_blank", "element_text", "V1", "Weight"
+  "element_blank", "element_text", "V1", "Weight", "feature"
))
diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R
index a44d4b570a09..d9ea69786ad9 100644
--- a/R-package/R/xgb.plot.shap.R
+++ b/R-package/R/xgb.plot.shap.R
@@ -81,6 +81,7 @@
 #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
 #' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)  # Summary plot
 #'
 #' # multiclass example - plots for each class separately:
 #' nclass <- 3
@@ -99,6 +100,7 @@
 #'               n_col = 2, col = col, pch = 16, pch_NA = 17)
 #' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
 #'               n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4)  # Summary plot
 #'
 #' @rdname xgb.plot.shap
 #' @export
@@ -109,69 +111,33 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
                           plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
                           plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
                           which = c("1d", "2d"), plot = TRUE, ...) {
-
-  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
-    stop("data: must be either matrix or dgCMatrix")
-
-  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
-    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
-
-  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
-    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
-
-  if (!is.null(shap_contrib) &&
-      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
-    stop("shap_contrib is not compatible with the provided data")
-
-  nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
-  idx <- sample(seq_len(nrow(data)), nsample)
-  data <- data[idx, ]
-
-  if (is.null(shap_contrib)) {
-    shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
-  } else {
-    shap_contrib <- shap_contrib[idx, ]
-  }
+  data_list <- xgb.shap.data(
+    data = data,
+    shap_contrib = shap_contrib,
+    features = features,
+    top_n = top_n,
+    model = model,
+    trees = trees,
+    target_class = target_class,
+    approxcontrib = approxcontrib,
+    subsample = subsample,
+    max_observations = 100000
+  )
+  data <- data_list[["data"]]
+  shap_contrib <- data_list[["shap_contrib"]]
+  features <- colnames(data)
 
   which <- match.arg(which)
   if (which == "2d")
     stop("2D plots are not implemented yet")
 
-  if (is.null(features)) {
-    imp <- xgb.importance(model = model, trees = trees)
-    top_n <- as.integer(top_n[1])
-    if (top_n < 1 && top_n > 100)
-      stop("top_n: must be an integer within [1, 100]")
-    features <- imp$Feature[1:min(top_n, NROW(imp))]
-  }
-
-  if (is.character(features)) {
-    if (is.null(colnames(data)))
-      stop("Either provide `data` with column names or provide `features` as column indices")
-    features <- match(features, colnames(data))
-  }
 
   if (n_col > length(features)) n_col <- length(features)
 
-  if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
-    shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
-                    else Reduce("+", lapply(shap_contrib, abs))
-  }
-
-  shap_contrib <- shap_contrib[, features, drop = FALSE]
-  data <- data[, features, drop = FALSE]
-  cols <- colnames(data)
-  if (is.null(cols)) cols <- colnames(shap_contrib)
-  if (is.null(cols)) cols <- paste0('X', seq_len(ncol(data)))
-  colnames(data) <- cols
-  colnames(shap_contrib) <- cols
-
   if (plot && which == "1d") {
     op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
               oma = c(0, 0, 0, 0) + 0.2,
               mar = c(3.5, 3.5, 0, 0) + 0.1,
               mgp = c(1.7, 0.6, 0))
-    for (f in cols) {
+    for (f in features) {
       ord <- order(data[, f])
       x <- data[, f][ord]
       y <- shap_contrib[, f][ord]
@@ -216,3 +182,105 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
   }
   invisible(list(data = data, shap_contrib = shap_contrib))
 }
+
+#' SHAP contribution dependency summary plot
+#'
+#' Compare SHAP contributions of different features.
+#'
+#' A point plot (each point representing one sample from \code{data}) is
+#' produced for each feature, with the points plotted on the SHAP value axis.
+#' Each point (observation) is coloured based on its feature value. The plot
+#' hence allows us to see which features have a negative / positive contribution
+#' to the model prediction, and whether the contribution is different for larger
+#' or smaller values of the feature. We effectively try to replicate the
+#' \code{summary_plot} function from https://github.com/slundberg/shap.
+#'
+#' @inheritParams xgb.plot.shap
+#'
+#' @return A \code{ggplot2} object.
+#' @export
+#'
+#' @examples
+#' # See \code{\link{xgb.plot.shap}}.
+#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
+#'   \url{https://github.com/slundberg/shap}
+xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
+                                  trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
+  # Only ggplot implementation is available.
+  xgb.ggplot.shap.summary(data, shap_contrib, features, top_n, model, trees, target_class, approxcontrib, subsample)
+}
+
+#' Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
+#' Internal utility function.
+#'
+#' @return A list containing: 'data', a matrix containing sample observations
+#'   and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
+#'   values for these observations.
+xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
+                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
+                          subsample = NULL, max_observations = 100000) {
+  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
+    stop("data: must be either matrix or dgCMatrix")
+
+  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
+
+  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
+
+  if (!is.null(shap_contrib) &&
+      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
+    stop("shap_contrib is not compatible with the provided data")
+
+  if (is.character(features) && is.null(colnames(data)))
+    stop("either provide `data` with column names or provide `features` as column indices")
+
+  if (is.null(model$feature_names) && model$nfeatures != ncol(data))
+    stop("if model has no feature_names, columns in `data` must match features in model")
+
+  if (!is.null(subsample)) {
+    idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE)
+  } else {
+    idx <- seq_len(min(nrow(data), max_observations))
+  }
+  data <- data[idx, ]
+  if (is.null(colnames(data))) {
+    colnames(data) <- paste0("X", seq_len(ncol(data)))
+  }
+
+  if (!is.null(shap_contrib)) {
+    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
+    }
+    shap_contrib <- shap_contrib[idx, ]
+    if (is.null(colnames(shap_contrib))) {
+      colnames(shap_contrib) <- paste0("X", seq_len(ncol(data)))
+    }
+  } else {
+    shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib)
+    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
+    }
+  }
+
+  if (is.null(features)) {
+    if (!is.null(model$feature_names)) {
+      imp <- xgb.importance(model = model, trees = trees)
+    } else {
+      imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))
+    }
+    top_n <- top_n[1]
+    if (top_n < 1 | top_n > 100) stop("top_n: must be an integer within [1, 100]")
+    features <- imp$Feature[1:min(top_n, NROW(imp))]
+  }
+  if (is.character(features)) {
+    features <- match(features, colnames(data))
+  }
+
+  shap_contrib <- shap_contrib[, features, drop = FALSE]
+  data <- data[, features, drop = FALSE]
+
+  list(
+    data = data,
+    shap_contrib = shap_contrib
+  )
+}
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 2ee1acf568a7..86c0efd0207e 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -351,11 +351,47 @@ test_that("xgb.plot.deepness works", {
   xgb.ggplot.deepness(model = bst.Tree)
 })
 
+test_that("xgb.shap.data works when top_n is provided", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  expect_equal(names(data_list), c("data", "shap_contrib"))
+  expect_equal(NCOL(data_list$data), 2)
+  expect_equal(NCOL(data_list$shap_contrib), 2)
+  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
+  expect_gt(length(colnames(data_list$data)), 0)
+  expect_gt(length(colnames(data_list$shap_contrib)), 0)
+
+  # for multiclass without target class provided
+  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2)
+  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
+  # for multiclass with target class provided
+  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0)
+  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
+})
+
+test_that("xgb.shap.data works with subsampling", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8)
+  expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix)))
+  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
+})
+
+test_that("prepare.ggplot.shap.data works", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
+  expect_s3_class(plot_data, "data.frame")
+  expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value"))
+  expect_s3_class(plot_data$feature, "factor")
+  # Each observation should have 1 row for each feature
+  expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2)
+})
+
 test_that("xgb.plot.shap works", {
   sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
   expect_equal(names(sh), c("data", "shap_contrib"))
-  expect_equal(NCOL(sh$data), 2)
-  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
+test_that("xgb.plot.shap.summary works", {
+  xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2)
 })
 
 test_that("check.deprecation works", {
diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index 8220135d9e32..792b43797ce5 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -68,6 +68,7 @@
 #include "../src/learner.cc"
 #include "../src/logging.cc"
 #include "../src/common/common.cc"
+#include "../src/common/random.cc"
 #include "../src/common/charconv.cc"
 #include "../src/common/timer.cc"
 #include "../src/common/quantile.cc"
diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py
new file mode 100644
index 000000000000..b9cee8c050af
--- /dev/null
+++ b/demo/guide-python/feature_weights.py
@@ -0,0 +1,49 @@
+'''Using feature weights to change column sampling.
+
+    .. versionadded:: 1.3.0
+'''
+
+import numpy as np
+import xgboost
+from matplotlib import pyplot as plt
+import argparse
+
+
+def main(args):
+    rng = np.random.RandomState(1994)
+
+    kRows = 1000
+    kCols = 10
+
+    X = rng.randn(kRows, kCols)
+    y = rng.randn(kRows)
+    fw = np.ones(shape=(kCols,))
+    for i in range(kCols):
+        fw[i] *= float(i)
+
+    dtrain = xgboost.DMatrix(X, y)
+    dtrain.set_info(feature_weights=fw)
+
+    bst = xgboost.train({'tree_method': 'hist',
+                         'colsample_bynode': 0.5},
+                        dtrain, num_boost_round=10,
+                        evals=[(dtrain, 'd')])
+    feature_map = bst.get_fscore()
+    # feature zero has 0 weight
+    assert feature_map.get('f0', None) is None
+    assert max(feature_map.values()) == feature_map.get('f9')
+
+    if args.plot:
+        xgboost.plot_importance(bst)
+        plt.show()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--plot',
+        type=int,
+        default=1,
+        help='Set to 0 to disable plotting the feature importances.')
+    args = parser.parse_args()
+    main(args)
diff --git a/demo/json-model/json_parser.py b/demo/json-model/json_parser.py
index eedcbf9c2287..c41a44d881c8 100644
--- a/demo/json-model/json_parser.py
+++ b/demo/json-model/json_parser.py
@@ -94,7 +94,7 @@ def __str__(self):
 
 class Model:
     '''Gradient boosted tree model.'''
-    def __init__(self, m: dict):
+    def __init__(self, model: dict):
         '''Construct the Model from JSON object.
 
         parameters
diff --git a/doc/conf.py b/doc/conf.py
index d17f9594a285..749d400c6e8f 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,7 +22,7 @@ import guzzle_sphinx_theme
 
 git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None)
-if git_branch is None:
+if not git_branch:
     # If SPHINX_GIT_BRANCH environment variable is not given, run git
     # to determine branch name
     git_branch = [
@@ -30,6 +30,8 @@
         git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')
     ]
     git_branch = [x for x in git_branch if 'HEAD' not in x]
+else:
+    git_branch = [git_branch]
 print('git_branch = {}'.format(git_branch[0]))
 try:
     filename, _ = urllib.request.urlretrieve(
diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst
index a4c9cdd53abe..6b3bf9348c12 100644
--- a/doc/jvm/index.rst
+++ b/doc/jvm/index.rst
@@ -65,6 +65,8 @@ This will check out the latest stable version from the Maven Central.
 For the latest release version number, please check `here <https://github.com/dmlc/xgboost/releases>`_.
 
+To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+
 .. note:: Using Maven repository hosted by the XGBoost project
 
   There may be some delay until a new release becomes available to Maven Central. If you would like to access the latest release immediately, add the Maven repository hosted by the XGBoost project:
@@ -83,6 +85,11 @@
 For the latest release version number, please check `here <https://github.com/dmlc/xgboost/releases>`_.
 
+To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+
 Installation from source
 ========================
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 626ddf10f8ab..7e7e774a2bfa 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -107,6 +107,10 @@ Parameters for Tree Booster
   'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at
   each split.
 
+  On the Python interface, one can set the ``feature_weights`` for DMatrix to define the
+  probability of each feature being selected when using column sampling.  There is a
+  similar parameter for the ``fit`` method in the sklearn interface.
+
 * ``lambda`` [default=1, alias: ``reg_lambda``]
 
   - L2 regularization term on weights. Increasing this value will make model more conservative.
@@ -224,7 +228,7 @@ Parameters for Tree Booster
   list is a group of indices of features that are allowed to interact with each other.
   See tutorial for more information
 
-Additional parameters for ``hist`` and ```gpu_hist`` tree method
+Additional parameters for ``hist`` and ``gpu_hist`` tree method
 ================================================================
 
 * ``single_precision_histogram``, [default=``false``]
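The paragraph added to doc/parameter.rst above describes the new ``feature_weights`` meta info. A short sketch of the Python usage it refers to, mirroring the demo introduced in this change set (a zero weight excludes a feature from column sampling):

```python
# feature_weights biases which columns colsample_by* draws at each sampling
# step; f0 gets weight 0 and should never be selected.
import numpy as np
import xgboost as xgb

X = np.random.randn(512, 8)
y = np.random.randn(512)

dtrain = xgb.DMatrix(X, y)
dtrain.set_info(feature_weights=np.arange(8, dtype=np.float32))

bst = xgb.train({'colsample_bynode': 0.5, 'tree_method': 'hist'},
                dtrain, num_boost_round=8)
print(bst.get_fscore())  # 'f0' should be absent: its weight is zero
```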
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 794cbdf19e8f..4db461d11b1c 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -483,6 +483,34 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                        bst_ulong *size,
                                        const char ***out_features);
 
+/*!
+ * \brief Set meta info from dense matrix.  Valid field names are:
+ *
+ *  - label
+ *  - weight
+ *  - base_margin
+ *  - group
+ *  - label_lower_bound
+ *  - label_upper_bound
+ *  - feature_weights
+ *
+ * \param handle An instance of data matrix
+ * \param field  Field name
+ * \param data   Pointer to consecutive memory storing data.
+ * \param size   Size of the data, this is relative to size of type.  (Meaning NOT number
+ *               of bytes.)
+ * \param type   Indicator of data type.  This is defined in xgboost::DataType enum class.
+ *
+ *    float    = 1
+ *    double   = 2
+ *    uint32_t = 3
+ *    uint64_t = 4
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, bst_ulong size, int type);
+
 /*!
  * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
  * \param handle a instance of data matrix
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 1ee292a89edb..f74dbd2c5a76 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -88,34 +88,17 @@ class MetaInfo {
   * \brief Type of each feature.  Automatically set when feature_type_names is specifed.
   */
  HostDeviceVector<FeatureType> feature_types;
+  /*
+   * \brief Weight of each feature, used to define the probability of each feature being
+   *   selected when using column sampling.
+   */
+  HostDeviceVector<float> feature_weigths;
 
  /*! \brief default constructor */
  MetaInfo()  = default;
  MetaInfo(MetaInfo&& that) = default;
  MetaInfo& operator=(MetaInfo&& that) = default;
-  MetaInfo& operator=(MetaInfo const& that) {
-    this->num_row_ = that.num_row_;
-    this->num_col_ = that.num_col_;
-    this->num_nonzero_ = that.num_nonzero_;
-
-    this->labels_.Resize(that.labels_.Size());
-    this->labels_.Copy(that.labels_);
-
-    this->group_ptr_ = that.group_ptr_;
-
-    this->weights_.Resize(that.weights_.Size());
-    this->weights_.Copy(that.weights_);
-
-    this->base_margin_.Resize(that.base_margin_.Size());
-    this->base_margin_.Copy(that.base_margin_);
-
-    this->labels_lower_bound_.Resize(that.labels_lower_bound_.Size());
-    this->labels_lower_bound_.Copy(that.labels_lower_bound_);
-
-    this->labels_upper_bound_.Resize(that.labels_upper_bound_.Size());
-    this->labels_upper_bound_.Copy(that.labels_upper_bound_);
-    return *this;
-  }
+  MetaInfo& operator=(MetaInfo const& that) = delete;
 
  /*!
   * \brief Validate all metainfo.
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 752342b9a90c..a78453604467 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -27,7 +27,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
   int gpu_id;
   // gpu page size in external memory mode, 0 means using the default.
   size_t gpu_page_size;
-  bool enable_experimental_json_serialization {false};
+  bool enable_experimental_json_serialization {true};
   bool validate_parameters {false};
 
   void CheckDeprecated() {
@@ -68,7 +68,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
         .set_lower_bound(0)
         .describe("GPU page size when running in external memory mode.");
     DMLC_DECLARE_FIELD(enable_experimental_json_serialization)
-        .set_default(false)
+        .set_default(true)
         .describe("Enable using JSON for memory serialization (Python Pickle, "
                   "rabit checkpoints etc.).");
     DMLC_DECLARE_FIELD(validate_parameters)
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index e7f6dc8ec089..fd9c69df3e7b 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -59,6 +59,21 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
     num_nodes = 1;
     deprecated_num_roots = 1;
   }
+
+  // Swap byte order for all fields.  Useful for transporting models between machines with
+  // different endianness (big endian vs little endian)
+  inline TreeParam ByteSwap() const {
+    TreeParam x = *this;
+    dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1);
+    dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1);
+    dmlc::ByteSwap(&x.num_deleted, sizeof(x.num_deleted), 1);
+    dmlc::ByteSwap(&x.deprecated_max_depth, sizeof(x.deprecated_max_depth), 1);
+    dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1);
+    dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1);
+    dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0]));
+    return x;
+  }
+
   // declare the parameters
   DMLC_DECLARE_PARAMETER(TreeParam) {
     // only declare the parameters that can be set by the user.
@@ -97,6 +112,16 @@ struct RTreeNodeStat {
     return loss_chg == b.loss_chg && sum_hess == b.sum_hess &&
            base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt;
   }
+  // Swap byte order for all fields.  Useful for transporting models between machines with
+  // different endianness (big endian vs little endian)
+  inline RTreeNodeStat ByteSwap() const {
+    RTreeNodeStat x = *this;
+    dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1);
+    dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1);
+    dmlc::ByteSwap(&x.base_weight, sizeof(x.base_weight), 1);
+    dmlc::ByteSwap(&x.leaf_child_cnt, sizeof(x.leaf_child_cnt), 1);
+    return x;
+  }
 };
 
 /*!
@@ -227,6 +252,16 @@ class RegTree : public Model {
              info_.leaf_value == b.info_.leaf_value;
     }
 
+    inline Node ByteSwap() const {
+      Node x = *this;
+      dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1);
+      dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1);
+      dmlc::ByteSwap(&x.cright_, sizeof(x.cright_), 1);
+      dmlc::ByteSwap(&x.sindex_, sizeof(x.sindex_), 1);
+      dmlc::ByteSwap(&x.info_, sizeof(x.info_), 1);
+      return x;
+    }
+
   private:
     /*!
     * \brief in leaf node, we have weights, in non-leaf nodes,
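The ByteSwap() helpers reverse the bytes of every fixed-width field so that a binary model written on a big-endian machine (e.g. the s390x worker added to the Travis matrix above) can be read on a little-endian one. The idea in a few lines of NumPy, purely as an illustration of what byte swapping does — this is not the XGBoost model format:

```python
# Same values, opposite byte order: swap the bytes, then relabel the dtype.
import numpy as np

fields = np.array([42, 7, -1], dtype='<i4')        # little-endian node fields
swapped = fields.byteswap().view(fields.dtype.newbyteorder('>'))

assert list(swapped) == [42, 7, -1]                # values are preserved
assert swapped.tobytes() != fields.tobytes()       # representations differ
```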
diff --git a/jvm-packages/README.md b/jvm-packages/README.md
index 5bc65e343cc0..7185e951c0ca 100644
--- a/jvm-packages/README.md
+++ b/jvm-packages/README.md
@@ -18,11 +18,11 @@ You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.o
 ## Add Maven Dependency
 
-XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5
+XGBoost4J, XGBoost4J-Spark, etc. in the Maven repository are compiled with g++-4.8.5.
 
 ### Access release version
 
-maven
+Maven
 
 ```
@@ -30,66 +30,82 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5
 <dependency>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost4j_2.12</artifactId>
     <version>latest_version_num</version>
-</dependency>
-```
-
-sbt
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
+    <version>latest_version_num</version>
+</dependency>
+```
+
+sbt
 ```sbt
-  "ml.dmlc" %% "xgboost4j" % "latest_version_num"
-```
+libraryDependencies ++= Seq(
+  "ml.dmlc" %% "xgboost4j" % "latest_version_num",
+  "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num"
+)
+```
 
 For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
 
-if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
+To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
 
 ### Access SNAPSHOT version
 
-You need to add github as repo:
+First add the following Maven repository hosted by the XGBoost project:
 
-maven:
+Maven:
 
 ```xml
 <repository>
-  <id>GitHub Repo</id>
-  <name>GitHub Repo</name>
-  <url>https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/</url>
+  <id>XGBoost4J Snapshot Repo</id>
+  <name>XGBoost4J Snapshot Repo</name>
+  <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/</url>
 </repository>
 ```
 
 sbt:
 
 ```sbt
-resolvers += "GitHub Repo" at "https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/"
+resolvers += "XGBoost4J Snapshot Repo" at "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/"
 ```
 
-the add dependency as following:
+Then add XGBoost4J as a dependency:
 
-maven
+Maven
 
 ```
 <dependency>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost4j_2.12</artifactId>
-    <version>latest_version_num</version>
+    <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
+    <version>latest_version_num-SNAPSHOT</version>
 </dependency>
-```
-
-sbt
+```
+
+sbt
 ```sbt
-  "ml.dmlc" %% "xgboost4j" % "latest_version_num"
-```
+libraryDependencies ++= Seq(
+  "ml.dmlc" %% "xgboost4j" % "latest_version_num-SNAPSHOT",
+  "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num-SNAPSHOT"
+)
+```
 
-For the latest release version number, please check [here](https://github.com/CodingCat/xgboost/tree/maven-repo/ml/dmlc/xgboost4j_2.12).
+For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html).
 
-if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
+To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
 
 ## Examples
 
 Full code examples for Scala, Java, Apache Spark, and Apache Flink can be found in the [examples package](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example).
 
-**NOTE on LIBSVM Format**:
+**NOTE on LIBSVM Format**:
 
-There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost.
+There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost.
 
 When users use Spark to load trainingset/testset in LibSVM format with the following code snippet:
@@ -108,7 +124,7 @@ You can build/package xgboost4j locally with the following steps:
 2. Clone this repo: `git clone --recursive https://github.com/dmlc/xgboost.git`
 3. Run the following command:
     - With Tests: `./xgboost/jvm-packages/dev/build-linux.sh`
-    - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests`
+    - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests`
 
 **Windows:**
 1. Ensure [Docker for Windows](https://docs.docker.com/docker-for-windows/install/) is installed.
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index fdca78ba403b..03de3bd1c019 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
     <description>JVM Package for XGBoost</description>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index b70f3e25f2e3..4f493caa444a 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-example_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
   <packaging>jar</packaging>
@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index 645822d2ad07..a65823a228a9 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-flink_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
index 115f563938ca..6435a17f37d9 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-spark_2.12</artifactId>
@@ -24,7 +24,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
index df787d8eb8ab..15ffe4c06c42 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
@@ -103,7 +103,8 @@ object DataUtils extends Serializable {
       case sparseVector: SparseVector =>
         featureValueOfSparseVector(rowHashCode, sparseVector)
     }
-    math.abs((rowHashCode.toLong + featureValue).toString.hashCode % numPartitions)
+    val nonNaNFeatureValue = if (featureValue.isNaN) { 0.0f } else { featureValue }
+    math.abs((rowHashCode.toLong + nonNaNFeatureValue).toString.hashCode % numPartitions)
   }
 
   private def attachPartitionKey(
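The one-line fix above matters because `rowHashCode.toLong + Double.NaN` is NaN for every row, so any row whose sampled feature value is missing hashed the same string "NaN" and the partition sizes became badly skewed. A Python model of the repaired key computation, with Python's `hash` standing in for the JVM's `String.hashCode`:

```python
# Zeroing the NaN restores the per-row variation from the row hash itself,
# so rows with missing values spread evenly across partitions again.
import math

def partition_key(row_hash: int, feature_value: float, num_partitions: int) -> int:
    if math.isnan(feature_value):   # the fix: fall back to 0.0
        feature_value = 0.0
    return abs(hash(str(row_hash + feature_value))) % num_partitions
```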
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
index 986b0843b5f3..ff0492f41a4a 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
@@ -16,6 +16,7 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
+import org.apache.spark.ml.linalg.Vectors
 import org.scalatest.FunSuite
 import org.apache.spark.sql.functions._
@@ -79,4 +80,34 @@ class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite wit
         map2
       }
   }
+
+  test("deterministic partitioning has a uniform repartition on dataset with missing values") {
+    val N = 10000
+    val dataset = (0 until N).map{ n =>
+      (n, n % 2, Vectors.sparse(3, Array(0, 1, 2), Array(Double.NaN, n, Double.NaN)))
+    }
+
+    val df = ss.createDataFrame(sc.parallelize(dataset)).toDF("id", "label", "features")
+
+    val dfRepartitioned = DataUtils.convertDataFrameToXGBLabeledPointRDDs(
+      col("label"),
+      col("features"),
+      lit(1.0),
+      lit(Float.NaN),
+      None,
+      10,
+      deterministicPartition = true,
+      df
+    ).head
+
+    val partitionsSizes = dfRepartitioned
+      .mapPartitions(iter => Array(iter.size.toDouble).iterator, true)
+      .collect()
+    val partitionMean = partitionsSizes.sum / partitionsSizes.length
+    val squaredDiffSum = partitionsSizes
+      .map(partitionSize => Math.pow(partitionSize - partitionMean, 2))
+    val standardDeviation = math.sqrt(squaredDiffSum.sum / squaredDiffSum.length)
+
+    assert(standardDeviation < math.sqrt(N.toDouble))
+  }
 }
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 927e6d42e418..fff44d9ea37d 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
   <packaging>jar</packaging>
diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION
index 468e6c357b7a..9d7c109bb7dc 100644
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-1.2.0-SNAPSHOT
+1.3.0-SNAPSHOT
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 4bc77783ee91..c8d0460825e5 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -40,7 +40,7 @@ class EarlyStopException(Exception):
     """
 
     def __init__(self, best_iteration):
-        super(EarlyStopException, self).__init__()
+        super().__init__()
         self.best_iteration = best_iteration
 
@@ -422,7 +422,7 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
             raise TypeError('Input data can not be a list.')
 
         self.missing = missing if missing is not None else np.nan
-        self.nthread = nthread if nthread is not None else 1
+        self.nthread = nthread if nthread is not None else -1
         self.silent = silent
 
         # force into void_p, mac need to pass things in as void_p
@@ -455,7 +455,8 @@ def set_info(self,
                  label_lower_bound=None, label_upper_bound=None,
                  feature_names=None,
-                 feature_types=None):
+                 feature_types=None,
+                 feature_weights=None):
         '''Set meta info for DMatrix.'''
         if label is not None:
             self.set_label(label)
@@ -473,6 +474,10 @@ def set_info(self,
             self.feature_names = feature_names
         if feature_types is not None:
             self.feature_types = feature_types
+        if feature_weights is not None:
+            from .data import dispatch_meta_backend
+            dispatch_meta_backend(matrix=self, data=feature_weights,
+                                  name='feature_weights')
 
     def get_float_info(self, field):
         """Get float property from the DMatrix.
@@ -1460,8 +1465,12 @@ def reshape_output(predt, rows):
                               ctypes.c_uint(iteration_range[1]))
 
         # once caching is supported, we can pass id(data) as cache id.
-        if isinstance(data, DataFrame):
-            data = data.values
+        try:
+            import pandas as pd
+            if isinstance(data, pd.DataFrame):
+                data = data.values
+        except ImportError:
+            pass
         if isinstance(data, np.ndarray):
             assert data.flags.c_contiguous
             arr = np.array(data.reshape(data.size), copy=False,
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 9491efd1c38c..e4c05dcc244e 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads,
     raise TypeError('Not supported type for data.' + str(type(data)))
 
 
+def _to_data_type(dtype: str, name: str):
+    dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
+    if dtype not in dtype_map.keys():
+        raise TypeError(
+            f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
+            f'for {name}.')
+    return dtype_map[dtype]
+
+
+def _validate_meta_shape(data):
+    if hasattr(data, 'shape'):
+        assert len(data.shape) == 1 or (
+            len(data.shape) == 2 and
+            (data.shape[1] == 0 or data.shape[1] == 1))
+
+
 def _meta_from_numpy(data, field, dtype, handle):
     data = _maybe_np_slice(data, dtype)
-    if dtype == 'uint32':
-        c_data = c_array(ctypes.c_uint32, data)
-        _check_call(_LIB.XGDMatrixSetUIntInfo(handle,
-                                              c_str(field),
-                                              c_array(ctypes.c_uint, data),
-                                              c_bst_ulong(len(data))))
-    elif dtype == 'float':
-        c_data = c_array(ctypes.c_float, data)
-        _check_call(_LIB.XGDMatrixSetFloatInfo(handle,
-                                               c_str(field),
-                                               c_data,
-                                               c_bst_ulong(len(data))))
-    else:
-        raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field)
+    interface = data.__array_interface__
+    assert interface.get('mask', None) is None, 'Masked array is not supported'
+    size = data.shape[0]
+
+    c_type = _to_data_type(str(data.dtype), field)
+    ptr = interface['data'][0]
+    ptr = ctypes.c_void_p(ptr)
+    _check_call(_LIB.XGDMatrixSetDenseInfo(
+        handle,
+        c_str(field),
+        ptr,
+        c_bst_ulong(size),
+        c_type
+    ))
 
 
 def _meta_from_list(data, field, dtype, handle):
@@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle):
 def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
     '''Dispatch for meta info.'''
     handle = matrix.handle
+    _validate_meta_shape(data)
     if data is None:
         return
     if _is_list(data):
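The rewritten `_meta_from_numpy()` above no longer copies meta info into a ctypes array; it hands `XGDMatrixSetDenseInfo` the address of the NumPy buffer taken from `__array_interface__`. A small self-contained sketch of that pointer hand-off:

```python
# numpy's __array_interface__ exposes the raw address of the underlying
# buffer, so C code can read the data in place without an extra copy.
import ctypes
import numpy as np

arr = np.arange(4, dtype=np.float32)
iface = arr.__array_interface__
ptr = ctypes.c_void_p(iface['data'][0])    # address of arr's buffer

# Reading the buffer back through the pointer recovers the same values.
view = (ctypes.c_float * arr.size).from_address(ptr.value)
assert list(view) == [0.0, 1.0, 2.0, 3.0]
```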
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index f533f7f3477d..96d358128a9a 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -441,6 +441,7 @@ def load_model(self, fname):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None, early_stopping_rounds=None,
             verbose=True, xgb_model=None, sample_weight_eval_set=None,
+            feature_weights=None,
             callbacks=None):
         # pylint: disable=invalid-name,attribute-defined-outside-init
         """Fit gradient boosting model
@@ -459,9 +460,6 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             A list of (X, y) tuple pairs to use as validation sets, for which
             metrics will be computed.
             Validation metrics will help us track the performance of the model.
-        sample_weight_eval_set : list, optional
-            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
-            instance weights on the i-th validation set.
         eval_metric : str, list of str, or callable, optional
             If a str, should be a built-in evaluation metric to use. See
             doc/parameter.rst.
@@ -490,6 +488,13 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        sample_weight_eval_set : list, optional
+            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
+            instance weights on the i-th validation set.
+        feature_weights : array_like
+            Weight for each feature, defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each iteration.
             It is possible to use predefined callbacks by using :ref:`callback_api`.
@@ -498,6 +503,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             .. code-block:: python
 
                 [xgb.callback.reset_learning_rate(custom_rates)]
+
         """
         self.n_features_in_ = X.shape[1]
 
@@ -505,6 +511,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
                                 base_margin=base_margin,
                                 missing=self.missing,
                                 nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
 
         evals_result = {}
@@ -750,7 +757,10 @@ def intercept_(self):
 
 @xgboost_model_doc(
     "Implementation of the scikit-learn API for XGBoost classification.",
-    ['model', 'objective'])
+    ['model', 'objective'], extra_parameters='''
+    n_estimators : int
+        Number of boosting rounds.
+''')
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
     def __init__(self, objective="binary:logistic", **kwargs):
@@ -759,7 +769,7 @@ def __init__(self, objective="binary:logistic", **kwargs):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
-            sample_weight_eval_set=None, callbacks=None):
+            sample_weight_eval_set=None, feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
 
         evals_result = {}
@@ -821,6 +831,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
 
         self._Booster = train(xgb_options, train_dmatrix,
                               self.get_num_boosting_rounds(),
@@ -1014,7 +1025,7 @@ def __init__(self,
                          **kwargs)
 
     def get_xgb_params(self):
-        params = super(XGBRFClassifier, self).get_xgb_params()
+        params = super().get_xgb_params()
         params['num_parallel_tree'] = self.n_estimators
         return params
 
@@ -1033,7 +1044,10 @@ def __init__(self, objective="reg:squarederror", **kwargs):
 
 @xgboost_model_doc(
     "scikit-learn API for XGBoost random forest regression.",
-    ['model', 'objective'])
+    ['model', 'objective'], extra_parameters='''
+    n_estimators : int
+        Number of trees in random forest to fit.
+''')
 class XGBRFRegressor(XGBRegressor):
     # pylint: disable=missing-docstring
     def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8,
@@ -1043,7 +1057,7 @@ def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8,
                          reg_lambda=reg_lambda, **kwargs)
 
     def get_xgb_params(self):
-        params = super(XGBRFRegressor, self).get_xgb_params()
+        params = super().get_xgb_params()
         params['num_parallel_tree'] = self.n_estimators
         return params
 
@@ -1101,10 +1115,10 @@ def __init__(self, objective='rank:pairwise', **kwargs):
             raise ValueError("please use XGBRanker for ranking task")
 
     def fit(self, X, y, group, sample_weight=None, base_margin=None,
-            eval_set=None,
-            sample_weight_eval_set=None, eval_group=None, eval_metric=None,
+            eval_set=None, sample_weight_eval_set=None,
+            eval_group=None, eval_metric=None,
             early_stopping_rounds=None, verbose=False, xgb_model=None,
-            callbacks=None):
+            feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """Fit gradient boosting ranker
 
@@ -1170,6 +1184,10 @@ def fit(self, X, y, group, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        feature_weights : array_like
+            Weight for each feature, defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each iteration.
             It is possible to use predefined callbacks by using
@@ -1205,6 +1223,7 @@ def _dmat_init(group, **params):
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
         train_dmatrix.set_group(group)
 
         evals_result = {}
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index aa6ecf43a784..397f83e69bf8 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -316,6 +316,17 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
   API_END();
 }
 
+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, xgboost::bst_ulong size,
+                                  int type) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();
+  CHECK(type >= 1 && type <= 4);
+  info.SetInfo(field, data, static_cast<DataType>(type), size);
+  API_END();
+}
+
 XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
                               const unsigned* group,
                               xgboost::bst_ulong len) {
diff --git a/src/common/common.h b/src/common/common.h
index b0bd6b6d6cec..a4397d1c89aa 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -9,12 +9,15 @@
 
 #include <exception>
 #include <limits>
+#include <algorithm>
 #include <functional>
+#include <numeric>
 #include <vector>
 #include <string>
 #include <sstream>
 #include <type_traits>
 #include <utility>
+#include <cinttypes>
 
 #if defined(__CUDACC__)
 #include
@@ -160,6 +163,15 @@ inline void AssertOneAPISupport() {
 #endif  // XGBOOST_USE_ONEAPI
 }
 
+template <typename Idx, typename V, typename Comp = std::less<V>>
+std::vector<Idx> ArgSort(std::vector<V> const &array, Comp comp = std::less<V>{}) {
+  std::vector<Idx> result(array.size());
+  std::iota(result.begin(), result.end(), 0);
+  std::stable_sort(
+      result.begin(), result.end(),
+      [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); });
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
* \param threadid the id of thread that calls this function */ - void Push(std::size_t key, ValueType value, int threadid) { + void Push(std::size_t key, ValueType&& value, int threadid) { size_t offset_key = key - base_row_offset_; SizeType &rp = thread_rptr_[threadid][offset_key]; - data_[rp++] = value; + data_[rp++] = std::move(value); } private: diff --git a/src/common/hist_util.h b/src/common/hist_util.h index d86b73135f34..0334b901224a 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -116,26 +116,14 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins) { for (auto& column : column_sizes) { column.resize(info.num_col_, 0); } - for (auto const& page : m->GetBatches()) { - page.data.HostVector(); - page.offset.HostVector(); - ParallelFor(page.Size(), threads, [&](size_t i) { - auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); - auto row = page[i]; - auto const *p_row = row.data(); - for (size_t j = 0; j < row.size(); ++j) { - local_column_sizes.at(p_row[j].index)++; - } - }); - } std::vector reduced(info.num_col_, 0); - - ParallelFor(info.num_col_, threads, [&](size_t i) { - for (auto const &thread : column_sizes) { - reduced[i] += thread[i]; + for (auto const& page : m->GetBatches()) { + auto const &entries_per_column = + HostSketchContainer::CalcColumnSize(page, info.num_col_, threads); + for (size_t i = 0; i < entries_per_column.size(); ++i) { + reduced[i] += entries_per_column[i]; } - }); - + } HostSketchContainer container(reduced, max_bins, HostSketchContainer::UseGroup(info)); for (auto const &page : m->GetBatches()) { diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 374864c8f4b0..9ab48a304b77 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -25,34 +25,67 @@ HostSketchContainer::HostSketchContainer(std::vector columns_size, } } -std::vector LoadBalance(SparsePage const &page, - std::vector columns_size, - size_t const nthreads) { - /* Some sparse datasets have their mass concentrating on small - * number of features. To avoid wating for a few threads running - * forever, we here distirbute different number of columns to - * different threads according to number of entries. */ - size_t const total_entries = page.data.Size(); +std::vector +HostSketchContainer::CalcColumnSize(SparsePage const &batch, + bst_feature_t const n_columns, + size_t const nthreads) { + auto page = batch.GetView(); + std::vector> column_sizes(nthreads); + for (auto &column : column_sizes) { + column.resize(n_columns, 0); + } + + ParallelFor(page.Size(), nthreads, [&](size_t i) { + auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); + auto row = page[i]; + auto const *p_row = row.data(); + for (size_t j = 0; j < row.size(); ++j) { + local_column_sizes.at(p_row[j].index)++; + } + }); + std::vector entries_per_columns(n_columns, 0); + ParallelFor(n_columns, nthreads, [&](size_t i) { + for (auto const &thread : column_sizes) { + entries_per_columns[i] += thread[i]; + } + }); + return entries_per_columns; +} + +std::vector HostSketchContainer::LoadBalance( + SparsePage const &batch, bst_feature_t n_columns, size_t const nthreads) { + /* Some sparse datasets have their mass concentrating on small number of features. To + * avoid wating for a few threads running forever, we here distirbute different number + * of columns to different threads according to number of entries. 
+ */ + auto page = batch.GetView(); + size_t const total_entries = page.data.size(); size_t const entries_per_thread = common::DivRoundUp(total_entries, nthreads); - std::vector cols_ptr(nthreads+1, 0); + std::vector> column_sizes(nthreads); + for (auto& column : column_sizes) { + column.resize(n_columns, 0); + } + std::vector entries_per_columns = + CalcColumnSize(batch, n_columns, nthreads); + std::vector cols_ptr(nthreads + 1, 0); size_t count {0}; size_t current_thread {1}; - for (auto col : columns_size) { - cols_ptr[current_thread]++; // add one column to thread + for (auto col : entries_per_columns) { + cols_ptr.at(current_thread)++; // add one column to thread count += col; - if (count > entries_per_thread + 1) { + CHECK_LE(count, total_entries); + if (count > entries_per_thread) { current_thread++; count = 0; - cols_ptr[current_thread] = cols_ptr[current_thread-1]; + cols_ptr.at(current_thread) = cols_ptr[current_thread-1]; } } // Idle threads. for (; current_thread < cols_ptr.size() - 1; ++current_thread) { cols_ptr[current_thread+1] = cols_ptr[current_thread]; } - return cols_ptr; } @@ -67,11 +100,10 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, // Use group index for weights? auto batch = page.GetView(); dmlc::OMPException exec; - // Parallel over columns. Asumming the data is dense, each thread owns a set of - // consecutive columns. + // Parallel over columns. Each thread owns a set of consecutive columns. auto const ncol = static_cast(info.num_col_); auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_; - auto thread_columns_ptr = LoadBalance(page, columns_size_, nthread); + auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread); #pragma omp parallel num_threads(nthread) { @@ -112,58 +144,158 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, monitor_.Stop(__func__); } -void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, - int max_bin, HistogramCuts *cuts) { - size_t required_cuts = std::min(summary.size, static_cast(max_bin)); - auto& cut_values = cuts->cut_values_.HostVector(); - for (size_t i = 1; i < required_cuts; ++i) { - bst_float cpt = summary.data[i].value; - if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { - cut_values.push_back(cpt); - } +void HostSketchContainer::GatherSketchInfo( + std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches) { + auto& worker_segments = *p_worker_segments; + worker_segments.resize(1, 0); + auto world = rabit::GetWorldSize(); + auto rank = rabit::GetRank(); + auto n_columns = sketches_.size(); + + std::vector sketch_size; + for (auto const& sketch : reduced) { + sketch_size.push_back(sketch.size); + } + std::vector& sketches_scan = *p_sketches_scan; + sketches_scan.resize((n_columns + 1) * world, 0); + size_t beg_scan = rank * (n_columns + 1); + std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), + sketches_scan.begin() + beg_scan + 1); + // Gather all column pointers + rabit::Allreduce(sketches_scan.data(), sketches_scan.size()); + + for (int32_t i = 0; i < world; ++i) { + size_t back = (i + 1) * (n_columns + 1) - 1; + auto n_entries = sketches_scan.at(back); + worker_segments.push_back(n_entries); + } + // Offset of sketch from each worker. 
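+  // (illustrative) the scan below turns worker_segments into a CSC-style
+  // pointer array over workers: worker i's entries end up occupying the
+  // half-open range [worker_segments[i], worker_segments[i + 1]) of
+  // global_sketches.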
+ std::partial_sum(worker_segments.begin(), worker_segments.end(), + worker_segments.begin()); + CHECK_GE(worker_segments.size(), 1); + auto total = worker_segments.back(); + + auto& global_sketches = *p_global_sketches; + global_sketches.resize(total, WQSketch::Entry{0, 0, 0, 0}); + auto worker_sketch = Span{global_sketches}.subspan( + worker_segments[rank], worker_segments[rank + 1] - worker_segments[rank]); + size_t cursor = 0; + for (auto const &sketch : reduced) { + std::copy(sketch.data, sketch.data + sketch.size, + worker_sketch.begin() + cursor); + cursor += sketch.size; } + + static_assert(sizeof(WQSketch::Entry) / 4 == sizeof(float), ""); + rabit::Allreduce( + reinterpret_cast(global_sketches.data()), + global_sketches.size() * sizeof(WQSketch::Entry) / sizeof(float)); } -void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { +void HostSketchContainer::AllReduce( + std::vector *p_reduced, + std::vector* p_num_cuts) { monitor_.Start(__func__); - rabit::Allreduce(columns_size_.data(), columns_size_.size()); - std::vector reduced(sketches_.size()); - std::vector num_cuts; - size_t nbytes = 0; + auto& num_cuts = *p_num_cuts; + CHECK_EQ(num_cuts.size(), 0); + auto &reduced = *p_reduced; + reduced.resize(sketches_.size()); + + size_t n_columns = sketches_.size(); + rabit::Allreduce(&n_columns, 1); + CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; + + // Prune the intermediate num cuts for synchronization. + std::vector global_column_size(columns_size_); + rabit::Allreduce(global_column_size.data(), global_column_size.size()); + +size_t nbytes = 0; for (size_t i = 0; i < sketches_.size(); ++i) { int32_t intermediate_num_cuts = static_cast(std::min( - columns_size_[i], static_cast(max_bins_ * WQSketch::kFactor))); - if (columns_size_[i] != 0) { + global_column_size[i], static_cast(max_bins_ * WQSketch::kFactor))); + if (global_column_size[i] != 0) { WQSketch::SummaryContainer out; sketches_[i].GetSummary(&out); reduced[i].Reserve(intermediate_num_cuts); CHECK(reduced[i].data); reduced[i].SetPrune(out, intermediate_num_cuts); + nbytes = std::max( + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), + nbytes); } + num_cuts.push_back(intermediate_num_cuts); - nbytes = std::max( - WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), nbytes); } + auto world = rabit::GetWorldSize(); + if (world == 1) { + return; + } + + std::vector worker_segments(1, 0); // CSC pointer to sketches. 
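+  // Flow from here on (as read from the code below): gather every worker's
+  // pruned summaries into one flat buffer, then rebuild and re-prune each
+  // feature's sketch from the per-worker slices in parallel.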
+ std::vector sketches_scan((n_columns + 1) * world, 0); + + std::vector global_sketches; + this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, + &global_sketches); + + std::vector final_sketches(n_columns); + ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) { + int32_t intermediate_num_cuts = num_cuts[fidx]; + auto nbytes = + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts); + + for (int32_t i = 1; i < world + 1; ++i) { + auto size = worker_segments.at(i) - worker_segments[i - 1]; + auto worker_sketches = Span{global_sketches}.subspan( + worker_segments[i - 1], size); + auto worker_scan = + Span(sketches_scan) + .subspan((i - 1) * (n_columns + 1), (n_columns + 1)); + + auto worker_feature = worker_sketches.subspan( + worker_scan[fidx], worker_scan[fidx + 1] - worker_scan[fidx]); + CHECK(worker_feature.data()); + WQSummary summary(worker_feature.data(), + worker_feature.size()); + auto &out = final_sketches.at(fidx); + out.Reduce(summary, nbytes); + } + + reduced.at(fidx).Reserve(intermediate_num_cuts); + reduced.at(fidx).SetPrune(final_sketches.at(fidx), intermediate_num_cuts); + }); + monitor_.Stop(__func__); +} - if (rabit::IsDistributed()) { - // FIXME(trivialfis): This call will allocate nbytes * num_columns on rabit, which - // may generate oom error when data is sparse. To fix it, we need to: - // - gather the column offsets over all workers. - // - run rabit::allgather on sketch data to collect all data. - // - merge all gathered sketches based on worker offsets and column offsets of data - // from each worker. - // See GPU implementation for details. - rabit::SerializeReducer sreducer; - sreducer.Allreduce(dmlc::BeginPtr(reduced), nbytes, reduced.size()); +void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, + int max_bin, HistogramCuts *cuts) { + size_t required_cuts = std::min(summary.size, static_cast(max_bin)); + auto& cut_values = cuts->cut_values_.HostVector(); + for (size_t i = 1; i < required_cuts; ++i) { + bst_float cpt = summary.data[i].value; + if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { + cut_values.push_back(cpt); + } } +} + +void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { + monitor_.Start(__func__); + std::vector reduced; + std::vector num_cuts; + this->AllReduce(&reduced, &num_cuts); cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); + for (size_t fid = 0; fid < reduced.size(); ++fid) { WQSketch::SummaryContainer a; size_t max_num_bins = std::min(num_cuts[fid], max_bins_); a.Reserve(max_num_bins + 1); CHECK(a.data); - if (columns_size_[fid] != 0) { + if (num_cuts[fid] != 0) { a.SetPrune(reduced[fid], max_num_bins + 1); CHECK(a.data && reduced[fid].data); const bst_float mval = a.data[0].value; @@ -173,6 +305,7 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { const float mval = 1e-5f; cuts->min_vals_.HostVector()[fid] = mval; } + AddCutPoint(a, max_num_bins, cuts); // push a value that is greater than anything const bst_float cpt diff --git a/src/common/quantile.h b/src/common/quantile.h index 11e2530f748e..a70bf809ea28 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -166,6 +166,16 @@ struct WQSummary { * \param src source sketch */ inline void CopyFrom(const WQSummary &src) { + if (!src.data) { + CHECK_EQ(src.size, 0); + size = 0; + return; + } + if (!data) { + CHECK_EQ(this->size, 0); + CHECK_EQ(src.size, 0); + return; + } size = src.size; std::memcpy(data, src.data, sizeof(Entry) * size); } @@ -721,6 +731,14 @@ class HostSketchContainer { 
return use_group_ind; } + static std::vector CalcColumnSize(SparsePage const &page, + bst_feature_t const n_columns, + size_t const nthreads); + + static std::vector LoadBalance(SparsePage const &page, + bst_feature_t n_columns, + size_t const nthreads); + static uint32_t SearchGroupIndFromRow(std::vector const &group_ptr, size_t const base_rowid) { CHECK_LT(base_rowid, group_ptr.back()) @@ -730,6 +748,14 @@ class HostSketchContainer { group_ptr.cbegin() - 1; return group_ind; } + // Gather sketches from all workers. + void GatherSketchInfo(std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches); + // Merge sketches from all workers. + void AllReduce(std::vector *p_reduced, + std::vector* p_num_cuts); /* \brief Push a CSR matrix. */ void PushRowPage(SparsePage const& page, MetaInfo const& info); diff --git a/src/common/random.cc b/src/common/random.cc new file mode 100644 index 000000000000..f386cad916b2 --- /dev/null +++ b/src/common/random.cc @@ -0,0 +1,38 @@ +/*! + * Copyright 2020 by XGBoost Contributors + * \file random.cc + */ +#include "random.h" + +namespace xgboost { +namespace common { +std::shared_ptr> ColumnSampler::ColSample( + std::shared_ptr> p_features, + float colsample) { + if (colsample == 1.0f) { + return p_features; + } + const auto &features = p_features->HostVector(); + CHECK_GT(features.size(), 0); + + int n = std::max(1, static_cast(colsample * features.size())); + auto p_new_features = std::make_shared>(); + auto &new_features = *p_new_features; + + if (feature_weights_.size() != 0) { + new_features.HostVector() = WeightedSamplingWithoutReplacement( + p_features->HostVector(), feature_weights_, n); + } else { + new_features.Resize(features.size()); + std::copy(features.begin(), features.end(), + new_features.HostVector().begin()); + std::shuffle(new_features.HostVector().begin(), + new_features.HostVector().end(), rng_); + new_features.Resize(n); + } + std::sort(new_features.HostVector().begin(), new_features.HostVector().end()); + return p_new_features; +} + +} // namespace common +} // namespace xgboost diff --git a/src/common/random.h b/src/common/random.h index 45af80ce030b..7fd461d22d0f 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2020 by Contributors * \file random.h * \brief Utility related to random. * \author Tianqi Chen @@ -10,14 +10,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "xgboost/host_device_vector.h" +#include "common.h" namespace xgboost { namespace common { @@ -75,6 +78,38 @@ using GlobalRandomEngine = RandomEngine; */ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) +/* + * Original paper: + * Weighted Random Sampling (2005; Efraimidis, Spirakis) + * + * Blog: + * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/ +*/ +template +std::vector WeightedSamplingWithoutReplacement( + std::vector const &array, std::vector const &weights, size_t n) { + // ES sampling. 
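+  // (illustrative summary) each index i draws u ~ U(0, 1) and is keyed by
+  // log(u) / w_i; keeping the n largest keys reproduces the
+  // Efraimidis-Spirakis draw.  log(u) / w orders items exactly like the
+  // paper's u^(1/w) key (log is monotone) while being safer for tiny weights.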
+ CHECK_EQ(array.size(), weights.size()); + std::vector keys(weights.size()); + std::uniform_real_distribution dist; + auto& rng = GlobalRandom(); + for (size_t i = 0; i < array.size(); ++i) { + auto w = std::max(weights.at(i), kRtEps); + auto u = dist(rng); + auto k = std::log(u) / w; + keys[i] = k; + } + auto ind = ArgSort(keys, std::greater<>{}); + ind.resize(n); + + std::vector results(ind.size()); + for (size_t k = 0; k < ind.size(); ++k) { + auto idx = ind[k]; + results[k] = array[idx]; + } + return results; +} + /** * \class ColumnSampler * @@ -82,36 +117,18 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) * colsample_bynode parameters. Should be initialised before tree construction and to * reset when tree construction is completed. */ - class ColumnSampler { std::shared_ptr> feature_set_tree_; std::map>> feature_set_level_; + std::vector feature_weights_; float colsample_bylevel_{1.0f}; float colsample_bytree_{1.0f}; float colsample_bynode_{1.0f}; GlobalRandomEngine rng_; - std::shared_ptr> ColSample( - std::shared_ptr> p_features, float colsample) { - if (colsample == 1.0f) return p_features; - const auto& features = p_features->HostVector(); - CHECK_GT(features.size(), 0); - int n = std::max(1, static_cast(colsample * features.size())); - auto p_new_features = std::make_shared>(); - auto& new_features = *p_new_features; - new_features.Resize(features.size()); - std::copy(features.begin(), features.end(), - new_features.HostVector().begin()); - std::shuffle(new_features.HostVector().begin(), - new_features.HostVector().end(), rng_); - new_features.Resize(n); - std::sort(new_features.HostVector().begin(), - new_features.HostVector().end()); - - return p_new_features; - } - public: + std::shared_ptr> ColSample( + std::shared_ptr> p_features, float colsample); /** * \brief Column sampler constructor. * \note This constructor manually sets the rng seed @@ -139,8 +156,10 @@ class ColumnSampler { * \param colsample_bytree * \param skip_index_0 (Optional) True to skip index 0. 
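   * A minimal call sketch (illustrative, names hypothetical):
   *   cs.Init(n, {}, 1.0f, 1.0f, 0.5f);  // empty weights: uniform shuffle
   *   cs.Init(n, w,  1.0f, 1.0f, 0.5f);  // non-empty w: weighted sampling
   * An empty weight vector falls back to the std::shuffle path in ColSample().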
   */
-  void Init(int64_t num_col, float colsample_bynode, float colsample_bylevel,
+  void Init(int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel,
             float colsample_bytree, bool skip_index_0 = false) {
+    feature_weights_ = std::move(feature_weights);
     colsample_bylevel_ = colsample_bylevel;
     colsample_bytree_ = colsample_bytree;
     colsample_bynode_ = colsample_bynode;
diff --git a/src/common/version.cc b/src/common/version.cc
index 3fb2e5c24392..e9d4fe9d13d8 100644
--- a/src/common/version.cc
+++ b/src/common/version.cc
@@ -49,9 +49,9 @@ Version::TripletT Version::Load(dmlc::Stream* fi) {
     LOG(FATAL) << msg;
   }
 
-  CHECK_EQ(fi->Read(&major, sizeof(major)), sizeof(major)) << msg;
-  CHECK_EQ(fi->Read(&minor, sizeof(major)), sizeof(minor)) << msg;
-  CHECK_EQ(fi->Read(&patch, sizeof(major)), sizeof(patch)) << msg;
+  CHECK(fi->Read(&major)) << msg;
+  CHECK(fi->Read(&minor)) << msg;
+  CHECK(fi->Read(&patch)) << msg;
 
   return std::make_tuple(major, minor, patch);
 }
@@ -69,9 +69,9 @@ void Version::Save(dmlc::Stream* fo) {
   std::tie(major, minor, patch) = Self();
   std::string verstr { u8"version:" };
   fo->Write(&verstr[0], verstr.size());
-  fo->Write(&major, sizeof(major));
-  fo->Write(&minor, sizeof(minor));
-  fo->Write(&patch, sizeof(patch));
+  fo->Write(major);
+  fo->Write(minor);
+  fo->Write(patch);
 }
 
 std::string Version::String(TripletT const& version) {
diff --git a/src/data/data.cc b/src/data/data.cc
index 401a35081830..d7d18f189642 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -83,7 +83,7 @@ void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name,
   CHECK(strm->Read(&is_scalar)) << invalid;
   CHECK(is_scalar) << invalid << "Expected field " << expected_name
                    << " to be a scalar; got a vector";
-  CHECK(strm->Read(field, sizeof(T))) << invalid;
+  CHECK(strm->Read(field)) << invalid;
 }
 
 template <typename T>
@@ -293,6 +293,9 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
   } else {
     out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs);
   }
+
+  out.feature_weigths.Resize(this->feature_weigths.Size());
+  out.feature_weigths.Copy(this->feature_weigths);
   return out;
 }
 
@@ -377,6 +380,16 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
     labels.resize(num);
     DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
                        std::copy(cast_dptr, cast_dptr + num, labels.begin()));
+  } else if (!std::strcmp(key, "feature_weights")) {
+    auto &h_feature_weights = feature_weigths.HostVector();
+    h_feature_weights.resize(num);
+    DISPATCH_CONST_PTR(
+        dtype, dptr, cast_dptr,
+        std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin()));
+    bool valid =
+        std::all_of(h_feature_weights.cbegin(), h_feature_weights.cend(),
+                    [](float w) { return w >= 0; });
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
   } else {
     LOG(FATAL) << "Unknown key for MetaInfo: " << key;
   }
@@ -396,6 +409,8 @@ void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
     vec = &this->labels_lower_bound_.HostVector();
   } else if (!std::strcmp(key, "label_upper_bound")) {
     vec = &this->labels_upper_bound_.HostVector();
+  } else if (!std::strcmp(key, "feature_weights")) {
+    vec = &this->feature_weigths.HostVector();
   } else {
     LOG(FATAL) << "Unknown float field name: " << key;
   }
@@ -497,6 +512,11 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
     auto &h_feature_types = feature_types.HostVector();
     LoadFeatureType(this->feature_type_names, &h_feature_types);
   }
+  if (!that.feature_weigths.Empty()) {
+
this->feature_weigths.Resize(that.feature_weigths.Size()); + this->feature_weigths.SetDevice(that.feature_weigths.DeviceIdx()); + this->feature_weigths.Copy(that.feature_weigths); + } } void MetaInfo::Validate(int32_t device) const { @@ -538,6 +558,11 @@ void MetaInfo::Validate(int32_t device) const { check_device(labels_lower_bound_); return; } + if (feature_weigths.Size() != 0) { + CHECK_EQ(feature_weigths.Size(), num_col_) + << "Size of feature_weights must equal to number of columns."; + check_device(feature_weigths); + } if (labels_upper_bound_.Size() != 0) { CHECK_EQ(labels_upper_bound_.Size(), num_row_) << "Size of label_upper_bound must equal to number of rows."; @@ -628,14 +653,18 @@ DMatrix* DMatrix::Load(const std::string& uri, std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi != nullptr) { common::PeekableInStream is(fi.get()); - if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) && - magic == data::SimpleDMatrix::kMagic) { - DMatrix* dmat = new data::SimpleDMatrix(&is); - if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&magic, sizeof(magic), 1); + } + if (magic == data::SimpleDMatrix::kMagic) { + DMatrix* dmat = new data::SimpleDMatrix(&is); + if (!silent) { + LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " + << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + } + return dmat; } - return dmat; } } } @@ -811,10 +840,11 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread // Set number of threads but keep old value so we can reset it after const int nthreadmax = omp_get_max_threads(); if (nthread <= 0) nthread = nthreadmax; - int nthread_original = omp_get_max_threads(); + const int nthread_original = omp_get_max_threads(); omp_set_num_threads(nthread); auto& offset_vec = offset.HostVector(); auto& data_vec = data.HostVector(); + size_t builder_base_row_offset = this->Size(); common::ParallelGroupBuilder< Entry, std::remove_reference::type::value_type> @@ -829,48 +859,74 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid; } } - builder.InitBudget(expected_rows, nthread); + size_t batch_size = batch.Size(); + const size_t thread_size = batch_size/nthread; + builder.InitBudget(expected_rows+1, nthread); uint64_t max_columns = 0; - + if (batch_size == 0) { + omp_set_num_threads(nthread_original); + return max_columns; + } + std::vector> max_columns_vector(nthread); + dmlc::OMPException exec; // First-pass over the batch counting valid elements - size_t batch_size = batch.Size(); -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - data::COOTuple element = line.GetElement(j); - max_columns = - std::max(max_columns, static_cast(element.column_idx + 1)); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - base_rowid; - // Adapter row index is absolute, here we want it relative to - // current page - CHECK_GE(key, builder_base_row_offset); - builder.AddBudget(key, tid); +#pragma omp parallel num_threads(nthread) + { + 
exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; + max_columns_vector[tid].resize(1, 0); + uint64_t& max_columns_local = max_columns_vector[tid][0]; + + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = element.row_idx - base_rowid; + CHECK_GE(key, builder_base_row_offset); + max_columns_local = + std::max(max_columns_local, static_cast(element.column_idx + 1)); + + if (!common::CheckNAN(element.value) && element.value != missing) { + // Adapter row index is absolute, here we want it relative to + // current page + builder.AddBudget(key, tid); + } + } } - } + }); } + exec.Rethrow(); + for (const auto & max : max_columns_vector) { + max_columns = std::max(max_columns, max[0]); + } + builder.InitStorage(); // Second pass over batch, placing elements in correct position -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - auto element = line.GetElement(j); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - - base_rowid; // Adapter row index is absolute, here we want - // it relative to current page - builder.Push(key, Entry(element.column_idx, element.value), tid); + +#pragma omp parallel num_threads(nthread) + { + exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = (element.row_idx - base_rowid); + if (!common::CheckNAN(element.value) && element.value != missing) { + builder.Push(key, Entry(element.column_idx, element.value), tid); + } + } } - } + }); } + exec.Rethrow(); omp_set_num_threads(nthread_original); + return max_columns; } diff --git a/src/data/data.cu b/src/data/data.cu index 5e63a828c207..15260498734d 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -58,6 +58,15 @@ void CopyGroupInfoImpl(ArrayInterface column, std::vector* out) { std::partial_sum(out->begin(), out->end(), out->begin()); } +namespace { +// thrust::all_of tries to copy lambda function. 
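+// A plain functor with a __device__ operator() sidesteps that copy problem;
+// (assumed rationale) a __device__ lambda would also require nvcc's
+// --expt-extended-lambda flag before thrust::all_of could accept it.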
+struct AllOfOp {
+  __device__ bool operator()(float w) {
+    return w >= 0;
+  }
+};
+}  // anonymous namespace
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get(j_interface);
@@ -82,6 +91,21 @@
   } else if (key == "group") {
     CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
+  } else if (key == "label_lower_bound") {
+    CopyInfoImpl(array_interface, &labels_lower_bound_);
+    return;
+  } else if (key == "label_upper_bound") {
+    CopyInfoImpl(array_interface, &labels_upper_bound_);
+    return;
+  } else if (key == "feature_weights") {
+    CopyInfoImpl(array_interface, &feature_weigths);
+    auto d_feature_weights = feature_weigths.ConstDeviceSpan();
+    auto valid =
+        thrust::all_of(thrust::device, d_feature_weights.data(),
+                       d_feature_weights.data() + d_feature_weights.size(),
+                       AllOfOp{});
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
+    return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
   }
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index f054ff64a490..06fa385b48de 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -192,8 +192,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
 SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
   int tmagic;
-  CHECK(in_stream->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic))
-      << "invalid input file format";
+  CHECK(in_stream->Read(&tmagic)) << "invalid input file format";
   CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
   info_.LoadBinary(in_stream);
   in_stream->Read(&sparse_page_.offset.HostVector());
@@ -203,7 +202,7 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
   std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
   int tmagic = kMagic;
-  fo->Write(&tmagic, sizeof(tmagic));
+  fo->Write(tmagic);
   info_.SaveBinary(fo.get());
   fo->Write(sparse_page_.offset.HostVector());
   fo->Write(sparse_page_.data.HostVector());
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 108af403b1a3..6db6de9fad55 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -144,7 +144,7 @@ class ExternalMemoryPrefetcher : dmlc::DataIter {
       std::unique_ptr<dmlc::Stream> finfo(
           dmlc::Stream::Create(info.name_info.c_str(), "r"));
       int tmagic;
-      CHECK_EQ(finfo->Read(&tmagic, sizeof(tmagic)), sizeof(tmagic));
+      CHECK(finfo->Read(&tmagic));
       CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
     }
     files_.resize(info.name_shards.size());
@@ -359,7 +359,7 @@ class SparsePageSource {
     std::unique_ptr<dmlc::Stream> fo(
         dmlc::Stream::Create(cache_info_.name_info.c_str(), "w"));
     int tmagic = kMagic;
-    fo->Write(&tmagic, sizeof(tmagic));
+    fo->Write(tmagic);
     // Either every row has query ID or none at all
     CHECK(qids.empty() || qids.size() == info.num_row_);
     info.SaveBinary(fo.get());
diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc
index 8ebd8284c269..4a20b48f7d1d 100644
--- a/src/gbm/gbtree_model.cc
+++ b/src/gbm/gbtree_model.cc
@@ -12,18 +12,35 @@ namespace gbm {
 void GBTreeModel::Save(dmlc::Stream* fo) const {
   CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
-  fo->Write(&param, sizeof(param));
+
+  if (DMLC_IO_NO_ENDIAN_SWAP) {
+    fo->Write(&param, sizeof(param));
+  } else {
+    auto x = param.ByteSwap();
+    fo->Write(&x, sizeof(x));
+  }
   for (const
auto & tree : trees) { tree->Save(fo); } if (tree_info.size() != 0) { - fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + } else { + for (const auto& e : tree_info) { + auto x = e; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(x)); + } + } } } void GBTreeModel::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)) << "GBTree: invalid model file"; + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } trees.clear(); trees_to_update.clear(); for (int32_t i = 0; i < param.num_trees; ++i) { @@ -33,9 +50,16 @@ void GBTreeModel::Load(dmlc::Stream* fi) { } tree_info.resize(param.num_trees); if (param.num_trees != 0) { - CHECK_EQ( - fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), - sizeof(int32_t) * param.num_trees); + if (DMLC_IO_NO_ENDIAN_SWAP) { + CHECK_EQ( + fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), + sizeof(int32_t) * param.num_trees); + } else { + for (auto& info : tree_info) { + CHECK_EQ(fi->Read(&info, sizeof(int32_t)), sizeof(int32_t)); + dmlc::ByteSwap(&info, sizeof(info), 1); + } + } } } diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index 7ac7d8f470a2..5a89878d3816 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -61,6 +61,21 @@ struct GBTreeModelParam : public dmlc::Parameter { .set_default(0) .describe("Reserved option for vector tree."); } + + // Swap byte order for all fields. Useful for transporting models between machines with different + // endianness (big endian vs little endian) + inline GBTreeModelParam ByteSwap() const { + GBTreeModelParam x = *this; + dmlc::ByteSwap(&x.num_trees, sizeof(x.num_trees), 1); + dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); + dmlc::ByteSwap(&x.deprecated_num_feature, sizeof(x.deprecated_num_feature), 1); + dmlc::ByteSwap(&x.pad_32bit, sizeof(x.pad_32bit), 1); + dmlc::ByteSwap(&x.deprecated_num_pbuffer, sizeof(x.deprecated_num_pbuffer), 1); + dmlc::ByteSwap(&x.deprecated_num_output_group, sizeof(x.deprecated_num_output_group), 1); + dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } }; struct GBTreeModel : public Model { diff --git a/src/learner.cc b/src/learner.cc index 47080a5c12b9..8210c4d1c89b 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -128,6 +128,19 @@ struct LearnerModelParamLegacy : public dmlc::Parameter std::string str = get(j_param.at("base_score")); from_chars(str.c_str(), str.c_str() + str.size(), base_score); } + inline LearnerModelParamLegacy ByteSwap() const { + LearnerModelParamLegacy x = *this; + dmlc::ByteSwap(&x.base_score, sizeof(x.base_score), 1); + dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1); + dmlc::ByteSwap(&x.num_class, sizeof(x.num_class), 1); + dmlc::ByteSwap(&x.contain_extra_attrs, sizeof(x.contain_extra_attrs), 1); + dmlc::ByteSwap(&x.contain_eval_metrics, sizeof(x.contain_eval_metrics), 1); + dmlc::ByteSwap(&x.major_version, sizeof(x.major_version), 1); + dmlc::ByteSwap(&x.minor_version, sizeof(x.minor_version), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } + // declare parameters DMLC_DECLARE_PARAMETER(LearnerModelParamLegacy) { DMLC_DECLARE_FIELD(base_score) @@ -694,7 +707,9 @@ class LearnerIO : public 
LearnerConfiguration { // read parameter CHECK_EQ(fi->Read(&mparam_, sizeof(mparam_)), sizeof(mparam_)) << "BoostLearner: wrong model format"; - + if (!DMLC_IO_NO_ENDIAN_SWAP) { + mparam_ = mparam_.ByteSwap(); + } CHECK(fi->Read(&tparam_.objective)) << "BoostLearner: wrong model format"; CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format"; @@ -828,7 +843,12 @@ class LearnerIO : public LearnerConfiguration { } std::string header {"binf"}; fo->Write(header.data(), 4); - fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + } else { + LearnerModelParamLegacy x = mparam.ByteSwap(); + fo->Write(&x, sizeof(LearnerModelParamLegacy)); + } fo->Write(tparam_.objective); fo->Write(tparam_.booster); gbm_->Save(fo); @@ -867,7 +887,13 @@ class LearnerIO : public LearnerConfiguration { // concatonate the model and config at final output, it's a temporary solution for // continuing support for binary model format fo->Write(&serialisation_header_[0], serialisation_header_.size()); - fo->Write(&json_offset, sizeof(json_offset)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&json_offset, sizeof(json_offset)); + } else { + auto x = json_offset; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(json_offset)); + } fo->Write(&binary_buf[0], binary_buf.size()); fo->Write(&config_str[0], config_str.size()); } @@ -904,6 +930,9 @@ class LearnerIO : public LearnerConfiguration { )doc"; int64_t sz {-1}; CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz)); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&sz, sizeof(sz), 1); + } CHECK_GT(sz, 0); size_t json_offset = static_cast(sz); std::string buffer; diff --git a/src/tree/param.h b/src/tree/param.h index 280f06066e44..dedc2a7f0ff5 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -239,6 +239,10 @@ struct TrainParam : public XGBoostParameter { if (this->max_leaves > 0) { n_nodes = this->max_leaves * 2 - 1; } else { + // bst_node_t will overflow. 
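+      // (illustrative) a full binary tree of depth d has 2^(d+1) - 1 nodes,
+      // so depth 31 already implies 2^32 - 1 node ids, beyond what the
+      // 32-bit signed bst_node_t can index.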
+ CHECK_LE(this->max_depth, 31) + << "max_depth can not be greater than 31 as that might generate 2 ** " + "32 - 1 nodes."; n_nodes = (1 << (this->max_depth + 1)) - 1; } CHECK_NE(n_nodes, 0); diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 8f45621ca15e..7f9721aef1d9 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -664,13 +664,26 @@ bst_node_t RegTree::GetNumSplitNodes() const { void RegTree::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(TreeParam)), sizeof(TreeParam)); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } nodes_.resize(param.num_nodes); stats_.resize(param.num_nodes); CHECK_NE(param.num_nodes, 0); CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()), sizeof(Node) * nodes_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (Node& node : nodes_) { + node = node.ByteSwap(); + } + } CHECK_EQ(fi->Read(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * stats_.size()), sizeof(RTreeNodeStat) * stats_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (RTreeNodeStat& stat : stats_) { + stat = stat.ByteSwap(); + } + } // chg deleted nodes deleted_nodes_.resize(0); for (int i = 1; i < param.num_nodes; ++i) { @@ -683,11 +696,32 @@ void RegTree::Load(dmlc::Stream* fi) { void RegTree::Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); CHECK_EQ(param.num_nodes, static_cast(stats_.size())); - fo->Write(¶m, sizeof(TreeParam)); CHECK_EQ(param.deprecated_num_roots, 1); CHECK_NE(param.num_nodes, 0); - fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); - fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(¶m, sizeof(TreeParam)); + } else { + TreeParam x = param.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); + } else { + for (const Node& node : nodes_) { + Node x = node.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + } else { + for (const RTreeNodeStat& stat : stats_) { + RTreeNodeStat x = stat.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } } void RegTree::LoadModel(Json const& in) { diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 951cfdb5ec27..45cdb0ba9163 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -235,8 +235,10 @@ class ColMaker: public TreeUpdater { } } { - column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); + column_sampler_.Init(fmat.Info().num_col_, + fmat.Info().feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree); } { // setup temp space for each thread diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5cbe75350402..3535a59d6f85 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -266,8 +266,10 @@ struct GPUHistMakerDevice { // Note that the column sampler must be passed by value because it is not // thread safe void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { - this->column_sampler.Init(num_columns, param.colsample_bynode, - param.colsample_bylevel, param.colsample_bytree); + auto const& info = dmat->Info(); + this->column_sampler.Init(num_columns, info.feature_weigths.HostVector(), + param.colsample_bynode, 
param.colsample_bylevel, + param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id)); this->interaction_constraints.Reset(); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 37a90dfebd74..95d3c2008ef9 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -841,11 +841,13 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& // store a pointer to the tree p_last_tree_ = &tree; if (data_layout_ == kDenseDataOneBased) { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, true); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, true); } else { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, false); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, false); } if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { /* specialized code for dense data: diff --git a/tests/ci_build/Dockerfile.s390x b/tests/ci_build/Dockerfile.s390x new file mode 100644 index 000000000000..5ad4a7888feb --- /dev/null +++ b/tests/ci_build/Dockerfile.s390x @@ -0,0 +1,27 @@ +FROM s390x/ubuntu:20.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \ + cmake time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base && \ + python3 -m pip install pytest hypothesis + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh index 950cfeb38573..50a190862dd1 100755 --- a/tests/ci_build/deploy_jvm_packages.sh +++ b/tests/ci_build/deploy_jvm_packages.sh @@ -3,22 +3,32 @@ set -e set -x -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" +if [ $# -ne 2 ]; then + echo "Usage: $0 [spark version] [build_gpu? 0 or 1]" exit 1 fi spark_version=$1 +build_gpu=$2 # Initialize local Maven repository ./tests/ci_build/initialize_maven.sh -rm -rf build/ cd jvm-packages +rm -rf $(find . -name target) +rm -rf ../build/ # Re-build package without Mock Rabit # Deploy to S3 bucket xgboost-maven-repo -mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +if [[ "$build_gpu" == "0" ]] +then + # Build CPU artifact + mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +else + # Build GPU artifact + sed -i -e 's/xgboost\(.*\)_\(.*\)<\/artifactId>/xgboost\1-gpu_\2<\/artifactId>/' $(find . 
-name pom.xml) + mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +fi set +x set +e diff --git a/tests/ci_build/doxygen.sh b/tests/ci_build/doxygen.sh deleted file mode 100755 index 41757eb6935f..000000000000 --- a/tests/ci_build/doxygen.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "Usage: $0 [branch name]" - exit 1 -fi - -set -e -set -x - -branch_name=$1 - -rm -rf build -mkdir build -cd build -cmake .. -DBUILD_C_DOC=ON -make -j - -tar cvjf ${branch_name}.tar.bz2 doc_doxygen/ diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index f4c2722fe92c..664118780cc3 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -23,9 +23,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { std::shared_ptr *dmat = static_cast *>(handle); xgboost::MetaInfo &info = (*dmat)->Info(); - ASSERT_EQ(info.num_col_, 2); - ASSERT_EQ(info.num_row_, 3); - ASSERT_EQ(info.num_nonzero_, 6); + ASSERT_EQ(info.num_col_, 2ul); + ASSERT_EQ(info.num_row_, 3ul); + ASSERT_EQ(info.num_nonzero_, 6ul); for (const auto &batch : (*dmat)->GetBatches()) { ASSERT_EQ(batch[0][0].fvalue, 0.0f); @@ -38,9 +38,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { } TEST(CAPI, XGDMatrixCreateFromMatOmp) { - std::vector num_rows = {100, 11374, 15000}; + std::vector num_rows = {100, 11374, 15000}; for (auto row : num_rows) { - int num_cols = 50; + bst_ulong num_cols = 50; int num_missing = 5; DMatrixHandle handle; std::vector data(num_cols * row, 1.5); diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc new file mode 100644 index 000000000000..006860b11af2 --- /dev/null +++ b/tests/cpp/common/test_common.cc @@ -0,0 +1,13 @@ +#include +#include "../../../src/common/common.h" + +namespace xgboost { +namespace common { +TEST(ArgSort, Basic) { + std::vector inputs {3.0, 2.0, 1.0}; + auto ret = ArgSort(inputs); + std::vector sol{2, 1, 0}; + ASSERT_EQ(ret, sol); +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 0fad360f4298..24c23b3e2608 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -159,10 +159,10 @@ TEST(CutsBuilder, SearchGroupInd) { HistogramCuts hmat; size_t group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 0); - ASSERT_EQ(group_ind, 0); + ASSERT_EQ(group_ind, 0ul); group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 5); - ASSERT_EQ(group_ind, 2); + ASSERT_EQ(group_ind, 2ul); EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); @@ -189,7 +189,7 @@ TEST(HistUtil, DenseCutsCategorical) { EXPECT_LT(cuts.MinValues()[0], x_sorted.front()); EXPECT_GT(cuts_from_sketch.front(), x_sorted.front()); EXPECT_GE(cuts_from_sketch.back(), x_sorted.back()); - EXPECT_EQ(cuts_from_sketch.size(), num_categories); + EXPECT_EQ(cuts_from_sketch.size(), static_cast(num_categories)); } } } diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index bd88d14ef1f2..d025e5ea60bf 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -162,7 +162,7 @@ inline void ValidateColumn(const HistogramCuts& cuts, int column_idx, // Check all cut points are unique EXPECT_EQ(std::set(cuts_begin, cuts_end).size(), - cuts_end - cuts_begin); + static_cast(cuts_end - cuts_begin)); auto unique = std::set(sorted_column.begin(), 
sorted_column.end()); if (unique.size() <= num_bins) { @@ -189,7 +189,7 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, // Collect data into columns std::vector> columns(dmat->Info().num_col_); for (auto& batch : dmat->GetBatches()) { - ASSERT_GT(batch.Size(), 0); + ASSERT_GT(batch.Size(), 0ul); for (auto i = 0ull; i < batch.Size(); i++) { for (auto e : batch[i]) { columns[e.index].push_back(e.fvalue); diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index ba3b12e337e2..029beee8d48b 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -222,7 +222,7 @@ TEST(Json, ParseArray) { auto json = Json::Load(StringView{str.c_str(), str.size()}); json = json["nodes"]; std::vector arr = get(json); - ASSERT_EQ(arr.size(), 3); + ASSERT_EQ(arr.size(), 3ul); Json v0 = arr[0]; ASSERT_EQ(get(v0["depth"]), 3); ASSERT_NEAR(get(v0["gain"]), 10.4866, kRtEps); @@ -284,7 +284,7 @@ TEST(Json, EmptyArray) { std::istringstream iss(str); auto json = Json::Load(StringView{str.c_str(), str.size()}); auto arr = get(json["leaf_vector"]); - ASSERT_EQ(arr.size(), 0); + ASSERT_EQ(arr.size(), 0ul); } TEST(Json, Boolean) { @@ -315,7 +315,7 @@ TEST(Json, AssigningObjects) { Json json; json = JsonObject(); json["Okay"] = JsonArray(); - ASSERT_EQ(get(json["Okay"]).size(), 0); + ASSERT_EQ(get(json["Okay"]).size(), 0ul); } { @@ -453,7 +453,8 @@ TEST(Json, Invalid) { Json load{Json::Load(StringView(str.c_str(), str.size()))}; } catch (dmlc::Error const &e) { std::string msg = e.what(); - ASSERT_NE(msg.find("EOF"), std::string::npos); + ASSERT_TRUE(msg.find("EOF") != std::string::npos + || msg.find("255") != std::string::npos); // EOF is printed as 255 on s390x has_thrown = true; }; ASSERT_TRUE(has_thrown); diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index c273658e54cb..fa748de1cc6c 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -5,14 +5,122 @@ namespace xgboost { namespace common { + +TEST(Quantile, LoadBalance) { + size_t constexpr kRows = 1000, kCols = 100; + auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + std::vector cols_ptr; + for (auto const &page : m->GetBatches()) { + cols_ptr = HostSketchContainer::LoadBalance(page, kCols, 13); + } + size_t n_cols = 0; + for (size_t i = 1; i < cols_ptr.size(); ++i) { + n_cols += cols_ptr[i] - cols_ptr[i - 1]; + } + CHECK_EQ(n_cols, kCols); +} + +void TestDistributedQuantile(size_t rows, size_t cols) { + std::string msg {"Skipping AllReduce test"}; + int32_t constexpr kWorkers = 4; + InitRabitContext(msg, kWorkers); + auto world = rabit::GetWorldSize(); + if (world != 1) { + ASSERT_EQ(world, kWorkers); + } else { + return; + } + + std::vector infos(2); + auto& h_weights = infos.front().weights_.HostVector(); + h_weights.resize(rows); + SimpleLCG lcg; + SimpleRealUniformDistribution dist(3, 1000); + std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); }); + std::vector column_size(cols, rows); + size_t n_bins = 64; + + // Generate cuts for distributed environment. 
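+  // Shape of the test (as read from the code below): each rank sketches its
+  // own random shard, MakeCuts() runs the distributed merge, and the result
+  // is compared against a single node that pushed every shard itself.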
+ auto sparsity = 0.5f; + auto rank = rabit::GetRank(); + HostSketchContainer sketch_distributed(column_size, n_bins, false); + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_distributed.PushRowPage(page, m->Info()); + } + HistogramCuts distributed_cuts; + sketch_distributed.MakeCuts(&distributed_cuts); + + // Generate cuts for single node environment + rabit::Finalize(); + CHECK_EQ(rabit::GetWorldSize(), 1); + std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); + HostSketchContainer sketch_on_single_node(column_size, n_bins, false); + for (auto rank = 0; rank < world; ++rank) { + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_on_single_node.PushRowPage(page, m->Info()); + } + } + + HistogramCuts single_node_cuts; + sketch_on_single_node.MakeCuts(&single_node_cuts); + + auto const& sptrs = single_node_cuts.Ptrs(); + auto const& dptrs = distributed_cuts.Ptrs(); + auto const& svals = single_node_cuts.Values(); + auto const& dvals = distributed_cuts.Values(); + auto const& smins = single_node_cuts.MinValues(); + auto const& dmins = distributed_cuts.MinValues(); + + ASSERT_EQ(sptrs.size(), dptrs.size()); + for (size_t i = 0; i < sptrs.size(); ++i) { + ASSERT_EQ(sptrs[i], dptrs[i]); + } + + ASSERT_EQ(svals.size(), dvals.size()); + for (size_t i = 0; i < svals.size(); ++i) { + ASSERT_NEAR(svals[i], dvals[i], 2e-2f); + } + + ASSERT_EQ(smins.size(), dmins.size()); + for (size_t i = 0; i < smins.size(); ++i) { + ASSERT_FLOAT_EQ(smins[i], dmins[i]); + } +} + +TEST(Quantile, DistributedBasic) { +#if defined(__unix__) + constexpr size_t kRows = 10, kCols = 10; + TestDistributedQuantile(kRows, kCols); +#endif +} + +TEST(Quantile, Distributed) { +#if defined(__unix__) + constexpr size_t kRows = 1000, kCols = 200; + TestDistributedQuantile(kRows, kCols); +#endif +} + TEST(Quantile, SameOnAllWorkers) { +#if defined(__unix__) std::string msg{"Skipping Quantile AllreduceBasic test"}; - size_t constexpr kWorkers = 4; + int32_t constexpr kWorkers = 4; InitRabitContext(msg, kWorkers); auto world = rabit::GetWorldSize(); if (world != 1) { CHECK_EQ(world, kWorkers); } else { + LOG(WARNING) << msg; return; } @@ -72,6 +180,8 @@ TEST(Quantile, SameOnAllWorkers) { } } }); + rabit::Finalize(); +#endif // defined(__unix__) } } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 7dea0b17deb3..e91f19ef84a8 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -7,7 +7,7 @@ namespace xgboost { namespace common { -inline void InitRabitContext(std::string msg, size_t n_workers) { +inline void InitRabitContext(std::string msg, int32_t n_workers) { auto port = std::getenv("DMLC_TRACKER_PORT"); std::string port_str; if (port) { @@ -35,7 +35,7 @@ template void RunWithSeedsAndBins(size_t rows, Fn fn) { for (size_t i = 0; i < bins.size() - 1; ++i) { bins[i] = i * 35 + 2; } - bins.back() = rows + 80; // provide a bin number greater than rows. + bins.back() = rows + 160; // provide a bin number greater than rows. 
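+  // (note) requesting more candidate bins than rows lets every distinct
+  // value become its own cut, which exercises the pruning path in the
+  // sketch container.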
std::vector infos(2); auto& h_weights = infos.front().weights_.HostVector(); diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index dc7b38554162..9b2a1515543f 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -8,9 +8,10 @@ namespace common { TEST(ColumnSampler, Test) { int n = 128; ColumnSampler cs; + std::vector feature_weights; // No node sampling - cs.Init(n, 1.0f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f); auto set0 = cs.GetFeatureSet(0); ASSERT_EQ(set0->Size(), 32); @@ -23,7 +24,7 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set2->Size(), 32); // Node sampling - cs.Init(n, 0.5f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f); auto set3 = cs.GetFeatureSet(0); ASSERT_EQ(set3->Size(), 32); @@ -33,19 +34,19 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set4->Size(), 32); // No level or node sampling, should be the same at different depth - cs.Init(n, 1.0f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f); ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set5 = cs.GetFeatureSet(0); ASSERT_EQ(set5->Size(), n); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set6 = cs.GetFeatureSet(0); ASSERT_EQ(set5->HostVector(), set6->HostVector()); // Should always be a minimum of one feature - cs.Init(n, 1e-16f, 1e-16f, 1e-16f); + cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); } @@ -56,13 +57,13 @@ TEST(ColumnSampler, ThreadSynchronisation) { size_t iterations = 10; size_t levels = 5; std::vector reference_result; - bool success = - true; // Cannot use google test asserts in multithreaded region + std::vector feature_weights; + bool success = true; // Cannot use google test asserts in multithreaded region #pragma omp parallel num_threads(num_threads) { for (auto j = 0ull; j < iterations; j++) { ColumnSampler cs(j); - cs.Init(n, 0.5f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f); for (auto level = 0ull; level < levels; level++) { auto result = cs.GetFeatureSet(level)->ConstHostVector(); #pragma omp single @@ -76,5 +77,54 @@ TEST(ColumnSampler, ThreadSynchronisation) { } ASSERT_TRUE(success); } + +TEST(ColumnSampler, WeightedSampling) { + auto test_basic = [](int first) { + std::vector feature_weights(2); + feature_weights[0] = std::abs(first - 1.0f); + feature_weights[1] = first - 0.0f; + ColumnSampler cs{0}; + cs.Init(2, feature_weights, 1.0, 1.0, 0.5); + auto feature_sets = cs.GetFeatureSet(0); + auto const &h_feat_set = feature_sets->HostVector(); + ASSERT_EQ(h_feat_set.size(), 1); + ASSERT_EQ(h_feat_set[0], first - 0); + }; + + test_basic(0); + test_basic(1); + + size_t constexpr kCols = 64; + std::vector feature_weights(kCols); + SimpleLCG rng; + SimpleRealUniformDistribution dist(.0f, 12.0f); + std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); }); + ColumnSampler cs{0}; + cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f); + std::vector features(kCols); + std::iota(features.begin(), features.end(), 0); + std::vector freq(kCols, 0); + for (size_t i = 0; i < 1024; ++i) { + auto fset = cs.GetFeatureSet(0); + ASSERT_EQ(kCols * 0.5, fset->Size()); + auto const& h_fset = fset->HostVector(); + for (auto f : h_fset) { + freq[f] += 1.0f; + } + } + + auto norm = std::accumulate(freq.cbegin(), freq.cend(), .0f); + for (auto& f : freq) { + f /= norm; + } + 
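+  // feature_weights is normalized below in the same way, so each feature's
+  // empirical selection frequency can be compared against its normalized
+  // weight (both vectors sum to 1).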
norm = std::accumulate(feature_weights.cbegin(), feature_weights.cend(), .0f);
+  for (auto& f : feature_weights) {
+    f /= norm;
+  }
+
+  for (size_t i = 0; i < feature_weights.size(); ++i) {
+    EXPECT_NEAR(freq[i], feature_weights[i], 1e-2);
+  }
+}
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 23334408f1fa..66428e8de950 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -179,7 +179,6 @@ TEST_F(SerializationTest, Exact) {
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -189,7 +188,6 @@ TEST_F(SerializationTest, Exact) {
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -198,7 +196,6 @@ TEST_F(SerializationTest, Exact) {
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -208,7 +205,6 @@ TEST_F(SerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -217,7 +213,6 @@ TEST_F(SerializationTest, Approx) {
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -225,7 +220,6 @@ TEST_F(SerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -235,7 +229,6 @@ TEST_F(SerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -244,7 +237,6 @@ TEST_F(SerializationTest, Hist) {
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -252,7 +244,6 @@ TEST_F(SerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -261,7 +252,6 @@ TEST_F(SerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -270,7 +260,6 @@ TEST_F(SerializationTest, GpuHist) {
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"tree_method", "gpu_hist"}},
@@ -278,7 +267,6 @@
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"num_parallel_tree", "4"},
@@ -287,7 +275,6 @@
   TestLearnerSerialization({{"booster", "dart"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"tree_method", "gpu_hist"}},
@@ -345,7 +332,6 @@ TEST_F(SerializationTest, GPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "gpu_coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -380,7 +366,6 @@ TEST_F(LogitSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -389,7 +374,6 @@ TEST_F(LogitSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -400,7 +384,6 @@ TEST_F(LogitSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -409,7 +392,6 @@ TEST_F(LogitSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -420,7 +402,6 @@ TEST_F(LogitSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -429,7 +410,6 @@ TEST_F(LogitSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -438,7 +418,6 @@ TEST_F(LogitSerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -450,14 +429,12 @@ TEST_F(LogitSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"objective", "binary:logistic"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"num_parallel_tree", "4"},
@@ -469,7 +446,6 @@ TEST_F(LogitSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
 }
@@ -479,7 +455,6 @@ TEST_F(LogitSerializationTest, GPUCoordDescent) {
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
 }
@@ -515,7 +490,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -525,7 +499,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -534,7 +507,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -545,7 +517,6 @@ TEST_F(MultiClassesSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -554,7 +525,6 @@ TEST_F(MultiClassesSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -565,7 +535,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -574,7 +543,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -584,7 +552,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -593,7 +560,6 @@ TEST_F(MultiClassesSerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -609,7 +575,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            // different result (1e-7) with CPU predictor for some
                            // entries.
                            {"predictor", "gpu_predictor"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
@@ -621,7 +586,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            // GPU_Hist has higher floating point error. 1e-6 doesn't work
                            // after num_parallel_tree goes to 4
                            {"num_parallel_tree", "3"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
@@ -630,7 +594,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
 }
@@ -640,7 +603,6 @@ TEST_F(MultiClassesSerializationTest, GPUCoordDescent) {
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
 }
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index fd5c9f43fb2a..5199a27d26e8 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -204,12 +204,11 @@ TEST(GpuHist, EvaluateRootSplit) {
   ASSERT_EQ(maker.hist.Data().size(), hist.size());
   thrust::copy(hist.begin(), hist.end(), maker.hist.Data().begin());
+  std::vector<float> feature_weights;
-  maker.column_sampler.Init(kNCols,
-                            param.colsample_bynode,
-                            param.colsample_bylevel,
-                            param.colsample_bytree,
-                            false);
+  maker.column_sampler.Init(kNCols, feature_weights, param.colsample_bynode,
+                            param.colsample_bylevel, param.colsample_bytree,
+                            false);
   RegTree tree;
   MetaInfo info;
@@ -506,5 +505,17 @@ TEST(GpuHist, ConfigIO) {
   ASSERT_EQ(j_updater, j_updater_roundtrip);
 }
+TEST(GpuHist, MaxDepth) {
+  GenericParameter generic_param(CreateEmptyGenericParam(0));
+  size_t constexpr kRows = 16;
+  size_t constexpr kCols = 4;
+  auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+
+  auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
+  learner->SetParam("max_depth", "32");
+  learner->Configure();
+
+  ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
+}
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc
index dbf2b80a2de4..1dbc5fc2c89b 100644
--- a/tests/cpp/tree/test_tree_model.cc
+++ b/tests/cpp/tree/test_tree_model.cc
@@ -6,6 +6,7 @@
 #include "xgboost/json_io.h"
 namespace xgboost {
+#if DMLC_IO_NO_ENDIAN_SWAP  // skip on big-endian machines
 // Manually construct tree in binary format
 // Do not use structs in case they change
 // We want to preserve backwards compatibility
@@ -85,6 +86,7 @@ TEST(Tree, Load) {
   EXPECT_EQ(tree[1].LeafValue(), 0.1f);
   EXPECT_TRUE(tree[1].IsLeaf());
 }
+#endif  // DMLC_IO_NO_ENDIAN_SWAP
 TEST(Tree, AllocateNode) {
   RegTree tree;
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index f0978a0afaf4..c44de28bd2ff 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -16,6 +16,20 @@ def test_dmatrix_numpy_init(self):
                 match='is not supported for DeviceQuantileDMatrix'):
             xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_dmatrix_feature_weights(self):
+        import cupy as cp
+        rng = cp.random.RandomState(1994)
+        data = rng.randn(5, 5)
+        m = xgb.DMatrix(data)
+
+        feature_weights = rng.uniform(size=5)
+        m.set_info(feature_weights=feature_weights)
+
+        cp.testing.assert_array_equal(
+            cp.array(m.get_float_info('feature_weights')),
+            feature_weights.astype(np.float32))
+
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_cupy_init(self):
         import cupy as cp
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index dad7ddc9db0c..acfb6db560be 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -110,16 +110,19 @@ def test_multiclass(self):
         # error must be smaller than 10%
         assert err < 0.1
-        # save dmatrix into binary buffer
-        dtest.save_binary('dtest.buffer')
-        # save model
-        bst.save_model('xgb.model')
-        # load model and data in
-        bst2 = xgb.Booster(model_file='xgb.model')
-        dtest2 = xgb.DMatrix('dtest.buffer')
-        preds2 = bst2.predict(dtest2)
-        # assert they are the same
-        assert np.sum(np.abs(preds2 - preds)) == 0
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dtest_path = os.path.join(tmpdir, 'dtest.buffer')
+            model_path = os.path.join(tmpdir, 'xgb.model')
+            # save dmatrix into binary buffer
+            dtest.save_binary(dtest_path)
+            # save model
+            bst.save_model(model_path)
+            # load model and data in
+            bst2 = xgb.Booster(model_file=model_path)
+            dtest2 = xgb.DMatrix(dtest_path)
+            preds2 = bst2.predict(dtest2)
+            # assert they are the same
+            assert np.sum(np.abs(preds2 - preds)) == 0
     def test_dump(self):
         data = np.random.randn(100, 2)
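The test_basic.py hunk above and the test_basic_models.py hunk below make the same cleanup: model and DMatrix artifacts now go into a tempfile.TemporaryDirectory, so failed runs cannot leave stray files in the working tree. The shared pattern, as a standalone sketch (file names are illustrative):

import os
import tempfile

import xgboost as xgb

def save_load_round_trip(bst, dtest):
    # Everything written under tmpdir is deleted when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, 'model.bin')
        dtest_path = os.path.join(tmpdir, 'dtest.buffer')
        bst.save_model(model_path)
        dtest.save_binary(dtest_path)
        bst2 = xgb.Booster(model_file=model_path)
        return bst2.predict(xgb.DMatrix(dtest_path))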
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 3eafdf71d821..529f7784c60d 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -6,6 +6,7 @@
 import testing as tm
 import pytest
 import locale
+import tempfile
 dpath = 'demo/data/'
 dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -60,15 +61,19 @@ def test_dart(self):
         # error must be smaller than 10%
         assert err < 0.1
-        # save dmatrix into binary buffer
-        dtest.save_binary('dtest.buffer')
-        model_path = 'xgb.model.dart'
-        # save model
-        bst.save_model(model_path)
-        # load model and data in
-        bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
-        dtest2 = xgb.DMatrix('dtest.buffer')
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
+            model_path = os.path.join(tmpdir, 'xgboost.model.dart')
+            # save dmatrix into binary buffer
+            dtest.save_binary(dtest_path)
+            # save model
+            bst.save_model(model_path)
+            # load model and data in
+            bst2 = xgb.Booster(params=param, model_file=model_path)
+            dtest2 = xgb.DMatrix(dtest_path)
+            preds2 = bst2.predict(dtest2, ntree_limit=num_round)
+            # assert they are the same
             assert np.sum(np.abs(preds2 - preds)) == 0
@@ -103,7 +109,6 @@ def my_logloss(preds, dtrain):
         for ii in range(len(preds_list)):
             for jj in range(ii + 1, len(preds_list)):
                 assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
-        os.remove(model_path)
     def run_eta_decay(self, tree_method):
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py
index 3ff37ea521c8..e437f426cc6f 100644
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -47,9 +47,12 @@ def test_cli_model(self):
         seed = 1994
         with tempfile.TemporaryDirectory() as tmpdir:
-            model_out_cli = os.path.join(tmpdir, 'test_load_cli_model-cli.bin')
-            model_out_py = os.path.join(tmpdir, 'test_cli_model-py.bin')
-            config_path = os.path.join(tmpdir, 'test_load_cli_model.conf')
+            model_out_cli = os.path.join(
+                tmpdir, 'test_load_cli_model-cli.json')
+            model_out_py = os.path.join(
+                tmpdir, 'test_cli_model-py.json')
+            config_path = os.path.join(
+                tmpdir, 'test_load_cli_model.conf')
             train_conf = self.template.format(data_path=data_path,
                                               seed=seed,
@@ -121,6 +124,8 @@ def test_cli_help(self):
         v = xgboost.__version__
         if v.find('SNAPSHOT') != -1:
             assert msg.split(':')[1].strip() == v.split('-')[0]
+        elif v.find('rc') != -1:
+            assert msg.split(':')[1].strip() == v.split('rc')[0]
         else:
             assert msg.split(':')[1].strip() == v
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 8b6535dbff45..33e64f7dd40b 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -1,12 +1,10 @@
 import os
 import subprocess
-import sys
 import pytest
 import testing as tm
-CURRENT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
+ROOT_DIR = tm.PROJECT_ROOT
 DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
 PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
@@ -19,21 +17,27 @@ def test_basic_walkthrough():
     os.remove('dump.raw.txt')
+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_multiclass_objective():
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_rmsle_objective():
-    major, minor = sys.version_info[:2]
-    if minor < 6:
-        pytest.skip('Skipping RMLSE test due to Python version being too low.')
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_matplotlib())
+def test_feature_weights_demo():
+    script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
+    cmd = ['python', script, '--plot=0']
+    subprocess.check_call(cmd)
+
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_sklearn_demo():
     script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
@@ -105,6 +109,8 @@ def test_evals_result_demo():
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_sklearn())
+@pytest.mark.skipif(**tm.no_pandas())
 def test_aft_demo():
     script = os.path.join(DEMO_DIR, 'aft_survival', 'aft_survival_demo.py')
     cmd = ['python', script]
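The test_dmatrix.py changes below exercise the new feature_weights field end to end. The user-facing surface is small: weights go in through set_info, are stored as float32 like the other float fields, come back out through get_float_info, and must be non-negative (the test asserts a ValueError otherwise). A minimal sketch:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
m = xgb.DMatrix(rng.randn(10, 50))

fw = rng.uniform(size=50)  # one non-negative weight per feature
m.set_info(feature_weights=fw)

# Round-trips through the float32 storage used for other float fields.
np.testing.assert_allclose(m.get_float_info('feature_weights'),
                           fw.astype(np.float32))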
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index ecf5f60411bf..f641ea2c54f4 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -99,6 +99,11 @@ def test_slice(self):
         X = rng.randn(100, 100)
         y = rng.randint(low=0, high=3, size=100)
         d = xgb.DMatrix(X, y)
+        np.testing.assert_equal(d.get_label(), y.astype(np.float32))
+
+        fw = rng.uniform(size=100).astype(np.float32)
+        d.set_info(feature_weights=fw)
+
         eval_res_0 = {}
         booster = xgb.train(
             {'num_class': 3, 'objective': 'multi:softprob'}, d,
@@ -106,19 +111,23 @@
         predt = booster.predict(d)
         predt = predt.reshape(100 * 3, 1)
+        d.set_base_margin(predt)
         ridxs = [1, 2, 3, 4, 5, 6]
-        d = d.slice(ridxs)
-        sliced_margin = d.get_float_info('base_margin')
+        sliced = d.slice(ridxs)
+
+        sliced_margin = sliced.get_float_info('base_margin')
         assert sliced_margin.shape[0] == len(ridxs) * 3
         eval_res_1 = {}
-        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
-                  num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
+        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
+                  num_boost_round=2, evals=[(sliced, 'd')],
+                  evals_result=eval_res_1)
         eval_res_0 = eval_res_0['d']['merror']
         eval_res_1 = eval_res_1['d']['merror']
+
         for i in range(len(eval_res_0)):
             assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02
@@ -196,13 +205,33 @@ def test_get_info(self):
         dtrain.get_float_info('base_margin')
         dtrain.get_uint_info('group_ptr')
+    def test_feature_weights(self):
+        kRows = 10
+        kCols = 50
+        rng = np.random.RandomState(1994)
+        fw = rng.uniform(size=kCols)
+        X = rng.randn(kRows, kCols)
+        m = xgb.DMatrix(X)
+        m.set_info(feature_weights=fw)
+        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
+        # Handle empty
+        m.set_info(feature_weights=np.empty((0, 0)))
+
+        assert m.get_float_info('feature_weights').shape[0] == 0
+
+        fw -= 1
+
+        def assign_weight():
+            m.set_info(feature_weights=fw)
+        self.assertRaises(ValueError, assign_weight)
+
     def test_sparse_dmatrix_csr(self):
         nrow = 100
         ncol = 1000
         x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
         assert x.indices.max() < ncol - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
@@ -215,7 +244,7 @@ def test_sparse_dmatrix_csc(self):
         x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
         assert x.indices.max() < nrow - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index 9338c095d657..4fca3e59302b 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -82,6 +82,7 @@ def test_cv_early_stopping(self):
         self.assert_metrics_length(cv, 1)
     @pytest.mark.skipif(**tm.no_sklearn())
+    @pytest.mark.skipif(**tm.no_pandas())
     def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self):
         from sklearn.datasets import load_breast_cancer
diff --git a/tests/python/test_model_compatibility.py b/tests/python/test_model_compatibility.py
index 55110720bf92..e02134d6cc71 100644
--- a/tests/python/test_model_compatibility.py
+++ b/tests/python/test_model_compatibility.py
@@ -1,10 +1,12 @@
 import xgboost
 import os
 import generate_models as gm
+import testing as tm
 import json
 import zipfile
 import pytest
 import copy
+import urllib.request
 def run_model_param_check(config):
@@ -87,6 +89,7 @@ def run_scikit_model_check(name, path):
         assert False
+@pytest.mark.skipif(**tm.no_sklearn())
 def test_model_compatibility():
     '''Test model compatibility, can only be run on CI as others don't have
     the credentials.
     '''
     path = os.path.dirname(os.path.abspath(__file__))
     path = os.path.join(path, 'models')
-    try:
-        import boto3
-        import botocore
-    except ImportError:
-        pytest.skip(
-            'Skiping compatibility tests as boto3 is not installed.')
-
-    s3_bucket = boto3.resource('s3').Bucket('xgboost-ci-jenkins-artifacts')
-    zip_path = 'xgboost_model_compatibility_test.zip'
-    s3_bucket.download_file(zip_path, zip_path)
+    zip_path, _ = urllib.request.urlretrieve('https://xgboost-ci-jenkins-artifacts.s3-us-west-2'
+                                             + '.amazonaws.com/xgboost_model_compatibility_test.zip')
     with zipfile.ZipFile(zip_path, 'r') as z:
         z.extractall(path)
diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py
index 18b0b83c7d60..e5e3a96e1bb0 100644
--- a/tests/python/test_plotting.py
+++ b/tests/python/test_plotting.py
@@ -14,27 +14,27 @@
 except ImportError:
     pass
+pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(),
+                                                 tm.no_graphviz()))
-pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), tm.no_graphviz()))
-
-
-dpath = 'demo/data/'
-rng = np.random.RandomState(1994)
+dpath = 'demo/data/agaricus.txt.train'
 class TestPlotting(unittest.TestCase):
-
     def test_plotting(self):
-        bst2 = xgb.Booster(model_file='xgb.model')
+        m = xgb.DMatrix(dpath)
+        booster = xgb.train({'max_depth': 2, 'eta': 1,
+                             'objective': 'binary:logistic'}, m,
+                            num_boost_round=2)
-        ax = xgb.plot_importance(bst2)
+        ax = xgb.plot_importance(booster)
         assert isinstance(ax, Axes)
         assert ax.get_title() == 'Feature importance'
         assert ax.get_xlabel() == 'F score'
         assert ax.get_ylabel() == 'Features'
         assert len(ax.patches) == 4
-        ax = xgb.plot_importance(bst2, color='r',
+        ax = xgb.plot_importance(booster, color='r',
                                  title='t', xlabel='x', ylabel='y')
         assert isinstance(ax, Axes)
         assert ax.get_title() == 't'
@@ -44,7 +44,7 @@ def test_plotting(self):
         for p in ax.patches:
             assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red
-        ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
+        ax = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'],
                                  title=None, xlabel=None, ylabel=None)
         assert isinstance(ax, Axes)
         assert ax.get_title() == ''
@@ -56,10 +56,10 @@ def test_plotting(self):
         assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
         assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
-        g = xgb.to_graphviz(bst2, num_trees=0)
+        g = xgb.to_graphviz(booster, num_trees=0)
         assert isinstance(g, Source)
-        ax = xgb.plot_tree(bst2, num_trees=0)
+        ax = xgb.plot_tree(booster, num_trees=0)
         assert isinstance(ax, Axes)
     def test_importance_plot_lim(self):
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index dc5c155e6027..145fa0b524cd 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -501,17 +501,20 @@ def run_updater_test(self, client, params, num_rounds, dataset,
                           num_boost_round=num_rounds,
                           evals=[(m, 'train')])['history']
         note(history)
-        assert tm.non_increasing(history['train'][dataset.metric])
+        history = history['train'][dataset.metric]
+        assert tm.non_increasing(history)
+        # Make sure that it's decreasing
+        assert history[-1] < history[0]
     @given(params=hist_parameter_strategy,
-           num_rounds=strategies.integers(10, 20),
+           num_rounds=strategies.integers(20, 30),
            dataset=tm.dataset_strategy)
     @settings(deadline=None)
     def test_hist(self, params, num_rounds, dataset, client):
        self.run_updater_test(client, params, num_rounds, dataset, 'hist')
     @given(params=exact_parameter_strategy,
-           num_rounds=strategies.integers(10, 20),
+           num_rounds=strategies.integers(20, 30),
            dataset=tm.dataset_strategy)
     @settings(deadline=None)
     def test_approx(self, client, params, num_rounds, dataset):
@@ -524,8 +527,7 @@ def run_quantile(self, name):
         exe = None
         for possible_path in {'./testxgboost', './build/testxgboost',
                               '../build/testxgboost',
-                              '../cpu-build/testxgboost',
-                              '../gpu-build/testxgboost'}:
+                              '../cpu-build/testxgboost'}:
             if os.path.exists(possible_path):
                 exe = possible_path
         if exe is None:
@@ -542,7 +544,7 @@ def runit(worker_addr, rabit_args):
             port = port.split('=')
             env = os.environ.copy()
             env[port[0]] = port[1]
-            return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE)
+            return subprocess.run([exe, test], env=env, capture_output=True)
         with LocalCluster(n_workers=4) as cluster:
             with Client(cluster) as client:
@@ -555,6 +557,7 @@ def runit(worker_addr, rabit_args):
                                         workers=workers,
                                         rabit_args=rabit_args)
                 results = client.gather(futures)
+
                 for ret in results:
                     msg = ret.stdout.decode('utf-8')
                     assert msg.find('1 test from Quantile') != -1, msg
@@ -563,4 +566,14 @@
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.gtest
     def test_quantile_basic(self):
+        self.run_quantile('DistributedBasic')
+
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.gtest
+    def test_quantile(self):
+        self.run_quantile('Distributed')
+
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.gtest
+    def test_quantile_same_on_all_workers(self):
         self.run_quantile('SameOnAllWorkers')
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 7f62a3e83052..ce0b57e823ff 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,3 +1,5 @@
+import collections
+import importlib.util
 import numpy as np
 import xgboost as xgb
 from xgboost.sklearn import XGBoostLabelEncoder
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
                   eval_set=[(X_train, y_train), (X_test, y_test)],
                   sample_weight_eval_set=[weights_train])
+
 def test_validation_weights_xgbclassifier():
     from sklearn.datasets import make_hastie_10_2
@@ -920,6 +923,64 @@ def test_pandas_input():
                             np.array([0, 1]))
+def run_feature_weights(increasing):
+    with TemporaryDirectory() as tmpdir:
+        kRows = 512
+        kCols = 64
+        colsample_bynode = 0.5
+        reg = xgb.XGBRegressor(tree_method='hist',
+                               colsample_bynode=colsample_bynode)
+        X = rng.randn(kRows, kCols)
+        y = rng.randn(kRows)
+        fw = np.ones(shape=(kCols,))
+        for i in range(kCols):
+            if increasing:
+                fw[i] *= float(i)
+            else:
+                fw[i] *= float(kCols - i)
+
+        reg.fit(X, y, feature_weights=fw)
+        model_path = os.path.join(tmpdir, 'model.json')
+        reg.save_model(model_path)
+        with open(model_path) as fd:
+            model = json.load(fd)
+
+        parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
+                                   'json_parser.py')
+        spec = importlib.util.spec_from_file_location("JsonParser",
+                                                      parser_path)
+        foo = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(foo)
+        model = foo.Model(model)
+        splits = {}
+        total_nodes = 0
+        for tree in model.trees:
+            n_nodes = len(tree.nodes)
+            total_nodes += n_nodes
+            for n in range(n_nodes):
+                if tree.is_leaf(n):
+                    continue
+                if splits.get(tree.split_index(n), None) is None:
+                    splits[tree.split_index(n)] = 1
+                else:
+                    splits[tree.split_index(n)] += 1
+
+        od = collections.OrderedDict(sorted(splits.items()))
+        tuples = [(k, v) for k, v in od.items()]
+        k, v = list(zip(*tuples))
+        w = np.polyfit(k, v, deg=1)
+        return w
+
+
+def test_feature_weights():
+    poly_increasing = run_feature_weights(True)
+    poly_decreasing = run_feature_weights(False)
+    # Approximated test; this depends on the implementation of the random
+    # number generator in the standard library.
+    assert poly_increasing[0] > 0.08
+    assert poly_decreasing[0] < -0.08
+
+
 class TestBoostFromPrediction(unittest.TestCase):
     def run_boost_from_prediction(self, tree_method):
         from sklearn.datasets import load_breast_cancer
diff --git a/tests/python/testing.py b/tests/python/testing.py
index c3f78f78e966..30b44079607b 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -2,13 +2,17 @@
 import os
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
 from xgboost.compat import DASK_INSTALLED
+import pytest
+import tempfile
+import xgboost as xgb
+import numpy as np
+
+hypothesis = pytest.importorskip('hypothesis')
+sklearn = pytest.importorskip('sklearn')
 from hypothesis import strategies
 from hypothesis.extra.numpy import arrays
 from joblib import Memory
 from sklearn import datasets
-import tempfile
-import xgboost as xgb
-import numpy as np
 try:
     import cupy as cp
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index a0e1c9f28651..500aa1e57ae1 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -88,3 +88,19 @@ if [ ${TASK} == "cmake_test" ]; then
   cd ..
   rm -rf build
 fi
+
+if [ ${TASK} == "s390x_test" ]; then
+  set -e
+
+  # Build and run C++ tests
+  rm -rf build
+  mkdir build && cd build
+  cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja
+  time ninja -v
+  ./testxgboost
+
+  # Run model compatibility tests
+  cd ..
+  python3 -m pip install --user pytest hypothesis
+  PYTHONPATH=./python-package python3 -m pytest --fulltrace -v -rxXs tests/python/ -k 'test_model'
+fi
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 5a7a91671da0..0e9f7e8fd687 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -20,6 +20,15 @@ if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then
   sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
 fi
+
+if [ ${TASK} == "s390x_test" ] && [ ${TRAVIS_CPU_ARCH} == "s390x" ]; then
+  sudo snap install cmake --channel=3.17/beta --classic
+  export PATH=/snap/bin:${PATH}
+  cmake --version
+  sudo apt-get update
+  sudo apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \
+    time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base
+fi
+
 if [ ${TASK} == "python_sdist_test" ] && [ ${TRAVIS_OS_NAME} == "linux" ]; then
   wget https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.sh
   sudo bash cmake-3.17.1-Linux-x86_64.sh --prefix=/usr/local --skip-license
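End to end, the feature-weight plumbing added in this patch biases column sampling toward heavily weighted features, which is exactly what run_feature_weights verifies by fitting a line to the per-feature split counts. A minimal usage sketch with the sklearn wrapper (values are illustrative; a colsample_* setting below 1 is needed for the weights to have any effect):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
X, y = rng.randn(512, 64), rng.randn(512)

# Linearly increasing weights: higher-index columns are sampled, and
# therefore split on, more often under colsample_bynode sampling.
fw = np.arange(1, X.shape[1] + 1, dtype=np.float32)

reg = xgb.XGBRegressor(tree_method='hist', colsample_bynode=0.5)
reg.fit(X, y, feature_weights=fw)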