diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 9d4196feb754..5c73ffa27aa9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -17,7 +17,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        os: [windows-latest, windows-2016, ubuntu-latest]
+        os: [windows-latest, ubuntu-latest]
     steps:
     - uses: actions/checkout@v2
@@ -40,12 +40,92 @@ jobs:
         cd jvm-packages
         mvn test -pl :xgboost4j_2.12
 
+  lint:
+    runs-on: ubuntu-latest
+    name: Code linting for Python and C++
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install Python packages
+      run: |
+        python -m pip install wheel setuptools
+        python -m pip install pylint cpplint numpy scipy scikit-learn
+    - name: Run lint
+      run: |
+        make lint
+
+  doxygen:
+    runs-on: ubuntu-latest
+    name: Generate C/C++ API doc using Doxygen
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install system packages
+      run: |
+        sudo apt-get install -y --no-install-recommends doxygen graphviz ninja-build
+        python -m pip install wheel setuptools
+        python -m pip install awscli
+    - name: Run Doxygen
+      run: |
+        mkdir build
+        cd build
+        cmake .. -DBUILD_C_DOC=ON -GNinja
+        ninja -v doc_doxygen
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+    - name: Publish
+      run: |
+        cd build/
+        tar cvjf ${{ steps.extract_branch.outputs.branch }}.tar.bz2 doc_doxygen/
+        python -m awscli s3 cp ./${{ steps.extract_branch.outputs.branch }}.tar.bz2 s3://xgboost-docs/ --acl public-read
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID_IAM_S3_UPLOADER }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY_IAM_S3_UPLOADER }}
+
+  sphinx:
+    runs-on: ubuntu-latest
+    name: Build docs using Sphinx
+    steps:
+    - uses: actions/checkout@v2
+      with:
+        submodules: 'true'
+    - uses: actions/setup-python@v2
+      with:
+        python-version: '3.7'
+        architecture: 'x64'
+    - name: Install system packages
+      run: |
+        sudo apt-get install -y --no-install-recommends graphviz
+        python -m pip install wheel setuptools
+        python -m pip install -r doc/requirements.txt
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+      if: github.ref == 'refs/heads/master' || contains(github.ref, 'refs/heads/release_')
+    - name: Run Sphinx
+      run: |
+        make -C doc html
+      env:
+        SPHINX_GIT_BRANCH: ${{ steps.extract_branch.outputs.branch }}
+
   lintr:
     runs-on: ${{ matrix.config.os }}
-    name: Run R linters on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
-
     strategy:
       matrix:
         config:
@@ -83,23 +163,16 @@ jobs:
         R.exe CMD INSTALL .
         Rscript.exe tests/run_lint.R
 
   test-with-R:
     runs-on: ${{ matrix.config.os }}
-    name: Test R on OS ${{ matrix.config.os }}, R ${{ matrix.config.r }}, Compiler ${{ matrix.config.compiler }}, Build ${{ matrix.config.build }}
-
     strategy:
       fail-fast: false
       matrix:
        config:
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'autotools'}
           - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'msvc', build: 'cmake'}
           - {os: windows-2016, r: 'release', compiler: 'msvc', build: 'cmake'}
-          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'autotools'}
           - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'autotools'}
-          - {os: windows-latest, r: 'release', compiler: 'mingw', build: 'cmake'}
           - {os: windows-2016, r: 'release', compiler: 'mingw', build: 'cmake'}
     env:
       R_REMOTES_NO_ERRORS_FROM_WARNINGS: true
@@ -130,8 +203,8 @@ jobs:
     - uses: actions/setup-python@v2
       with:
-        python-version: '3.6' # Version range or exact version of a Python version to use, using SemVer's version range syntax
-        architecture: 'x64' # optional x64 or x86. Defaults to x64 if not specified
+        python-version: '3.7'
+        architecture: 'x64'
     - name: Test R
       run: |
diff --git a/.travis.yml b/.travis.yml
index d0f72423b6ea..5f782ffe472a 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,38 +1,33 @@
-# disable sudo for container build.
 sudo: required
-# Enabling test OS X
-os:
-  - linux
-  - osx
-
 osx_image: xcode10.1
 dist: bionic
-# Use Build Matrix to do lint and build seperately
 env:
-  matrix:
-    # python package test
-    - TASK=python_test
-    # test installation of Python source distribution
-    - TASK=python_sdist_test
-    # java package test
-    - TASK=java_test
-    # cmake test
-    - TASK=cmake_test
   global:
     - secure: "PR16i9F8QtNwn99C5NDp8nptAS+97xwDtXEJJfEiEVhxPaaRkOp0MPWhogCaK0Eclxk1TqkgWbdXFknwGycX620AzZWa/A1K3gAs+GrpzqhnPMuoBJ0Z9qxXTbSJvCyvMbYwVrjaxc/zWqdMU8waWz8A7iqKGKs/SqbQ3rO6v7c="
     - secure: "dAGAjBokqm/0nVoLMofQni/fWIBcYSmdq4XvCBX1ZAMDsWnuOfz/4XCY6h2lEI1rVHZQ+UdZkc9PioOHGPZh5BnvE49/xVVWr9c4/61lrDOlkD01ZjSAeoV0fAZq+93V/wPl4QV+MM+Sem9hNNzFSbN5VsQLAiWCSapWsLdKzqA="
-matrix:
-  exclude:
+jobs:
+  include:
     - os: linux
+      arch: amd64
+      env: TASK=python_sdist_test
+    - os: osx
+      arch: amd64
       env: TASK=python_test
-    - os: linux
+    - os: osx
+      arch: amd64
+      env: TASK=python_sdist_test
+    - os: osx
+      arch: amd64
       env: TASK=java_test
-    - os: linux
+    - os: osx
+      arch: amd64
      env: TASK=cmake_test
+    - os: linux
+      arch: s390x
+      env: TASK=s390x_test
 
 # dependent brew packages
 addons:
@@ -47,6 +42,9 @@ addons:
       - wget
       - r
     update: true
+  apt:
+    packages:
+      - snapd
 
 before_install:
   - source tests/travis/travis_setup_env.sh
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e26265130e07..24b9ac3adcaf 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,5 @@
 cmake_minimum_required(VERSION 3.13)
-project(xgboost LANGUAGES CXX C VERSION 1.2.0)
+project(xgboost LANGUAGES CXX C VERSION 1.3.0)
 include(cmake/Utils.cmake)
 list(APPEND CMAKE_MODULE_PATH "${xgboost_SOURCE_DIR}/cmake/modules")
 cmake_policy(SET CMP0022 NEW)
diff --git a/Jenkinsfile b/Jenkinsfile
index 60e8116f330b..54c8b9565ec8 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -49,24 +49,12 @@ pipeline {
         stash name: 'srcs'
       }
     }
-    stage('Jenkins Linux: Formatting Check') {
-      agent none
-      steps {
-        script {
-          parallel ([
-            'clang-tidy': { ClangTidy() },
-            'lint': { Lint() },
-            'sphinx-doc': { SphinxDoc() },
-            'doxygen': { Doxygen() }
-          ])
-        }
-      }
-    }
     stage('Jenkins Linux: Build') {
       agent none
       steps {
         script {
           parallel ([
+            'clang-tidy': { ClangTidy() },
             'build-cpu': { BuildCPU() },
             'build-cpu-rabit-mock': { BuildCPUMock() },
             'build-cpu-non-omp': { BuildCPUNonOmp() },
@@ -152,50 +140,6 @@ def ClangTidy() {
   }
 }
 
-def Lint() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running lint..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    sh """
-    ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make lint"
-    """
-    deleteDir()
-  }
-}
-
-def SphinxDoc() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running sphinx-doc..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    def docker_extra_params = "CI_DOCKER_EXTRA_PARAMS_INIT='-e SPHINX_GIT_BRANCH=${BRANCH_NAME}'"
-    sh """#!/bin/bash
-    ${docker_extra_params} ${dockerRun} ${container_type} ${docker_binary} bash -c "source activate cpu_test && make -C doc html"
-    """
-    deleteDir()
-  }
-}
-
-def Doxygen() {
-  node('linux && cpu') {
-    unstash name: 'srcs'
-    echo "Running doxygen..."
-    def container_type = "cpu"
-    def docker_binary = "docker"
-    sh """
-    ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/doxygen.sh ${BRANCH_NAME}
-    """
-    if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
-      echo 'Uploading doc...'
-      s3Upload file: "build/${BRANCH_NAME}.tar.bz2", bucket: 'xgboost-docs', acl: 'PublicRead', path: "doxygen/${BRANCH_NAME}.tar.bz2"
-    }
-    deleteDir()
-  }
-}
-
 def BuildCPU() {
   node('linux && cpu') {
     unstash name: 'srcs'
@@ -301,7 +245,7 @@ def BuildCUDA(args) {
 }
 
 def BuildJVMPackagesWithCUDA(args) {
-  node('linux && gpu') {
+  node('linux && mgpu') {
     unstash name: 'srcs'
     echo "Build XGBoost4J-Spark with Spark ${args.spark_version}, CUDA ${args.cuda_version}"
     def container_type = "jvm_gpu_build"
@@ -496,10 +440,11 @@ def DeployJVMPackages(args) {
     unstash name: 'srcs'
     if (env.BRANCH_NAME == 'master' || env.BRANCH_NAME.startsWith('release')) {
       echo 'Deploying to xgboost-maven-repo S3 repo...'
-      def container_type = "jvm"
-      def docker_binary = "docker"
       sh """
-      ${dockerRun} ${container_type} ${docker_binary} tests/ci_build/deploy_jvm_packages.sh ${args.spark_version}
+      ${dockerRun} jvm docker tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 0
+      """
+      sh """
+      ${dockerRun} jvm_gpu_build docker --build-arg CUDA_VERSION=10.0 tests/ci_build/deploy_jvm_packages.sh ${args.spark_version} 1
       """
     }
     deleteDir()
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index b5d7585a3ca8..1a35eaa0612a 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: xgboost
 Type: Package
 Title: Extreme Gradient Boosting
-Version: 1.2.0.1
+Version: 1.3.0.1
 Date: 2020-02-21
 Authors@R: c(
   person("Tianqi", "Chen", role = c("aut"),
diff --git a/R-package/R/xgb.ggplot.R b/R-package/R/xgb.ggplot.R
index 3b76e9facf42..339e0fac1600 100644
--- a/R-package/R/xgb.ggplot.R
+++ b/R-package/R/xgb.ggplot.R
@@ -99,6 +99,85 @@ xgb.ggplot.deepness <- function(model = NULL, which = c("2x1", "max.depth", "med
   }
 }
 
+#' @rdname xgb.plot.shap.summary
+#' @export
+xgb.ggplot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
+                                    trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
+  data_list <- xgb.shap.data(
+    data = data,
+    shap_contrib = shap_contrib,
+    features = features,
+    top_n = top_n,
+    model = model,
+    trees = trees,
+    target_class = target_class,
+    approxcontrib = approxcontrib,
+    subsample = subsample,
+    max_observations = 10000  # 10,000 samples per feature.
+  )
+  p_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
+  # Reverse factor levels so that the first level is at the top of the plot
+  p_data[, "feature" := factor(feature, rev(levels(feature)))]
+
+  p <- ggplot2::ggplot(p_data, ggplot2::aes(x = feature, y = shap_value, colour = feature_value)) +
+    ggplot2::geom_jitter(alpha = 0.5, width = 0.1) +
+    ggplot2::scale_colour_viridis_c(limits = c(-3, 3), option = "plasma", direction = -1) +
+    ggplot2::geom_abline(slope = 0, intercept = 0, colour = "darkgrey") +
+    ggplot2::coord_flip()
+
+  p
+}
+
+#' Combine and melt feature values and SHAP contributions for sample
+#' observations.
+#'
+#' Conforms to data format required for ggplot functions.
+#'
+#' Internal utility function.
+#'
+#' @param data_list List containing 'data' and 'shap_contrib' returned by
+#'   \code{xgb.shap.data()}.
+#' @param normalize Whether to standardize feature values to have mean 0 and
+#'   standard deviation 1 (useful for comparing multiple features on the same
+#'   plot). Default \code{FALSE}.
+#'
+#' @return A data.table containing the observation ID, the feature name, the
+#'   feature value (normalized if specified), and the SHAP contribution value.
+prepare.ggplot.shap.data <- function(data_list, normalize = FALSE) {
+  data <- data_list[["data"]]
+  shap_contrib <- data_list[["shap_contrib"]]
+
+  data <- data.table::as.data.table(as.matrix(data))
+  if (normalize) {
+    data[, (names(data)) := lapply(.SD, normalize)]
+  }
+  data[, "id" := seq_len(nrow(data))]
+  data_m <- data.table::melt.data.table(data, id.vars = "id", variable.name = "feature", value.name = "feature_value")
+
+  shap_contrib <- data.table::as.data.table(as.matrix(shap_contrib))
+  shap_contrib[, "id" := seq_len(nrow(shap_contrib))]
+  shap_contrib_m <- data.table::melt.data.table(shap_contrib, id.vars = "id", variable.name = "feature", value.name = "shap_value")
+
+  p_data <- data.table::merge.data.table(data_m, shap_contrib_m, by = c("id", "feature"))
+
+  p_data
+}
+
+#' Scale feature value to have mean 0, standard deviation 1
+#'
+#' This is used to compare multiple features on the same plot.
+#' Internal utility function
+#'
+#' @param x Numeric vector
+#'
+#' @return Numeric vector with mean 0 and sd 1.
+normalize <- function(x) {
+  loc <- mean(x, na.rm = TRUE)
+  scale <- stats::sd(x, na.rm = TRUE)
+
+  (x - loc) / scale
+}
+
 # Plot multiple ggplot graph aligned by rows and columns.
 # ... the plots
 # cols number of columns
@@ -131,5 +210,5 @@ multiplot <- function(..., cols = 1) {
 
 globalVariables(c(
   "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme",
-  "element_blank", "element_text", "V1", "Weight"
+  "element_blank", "element_text", "V1", "Weight", "feature"
))
diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R
index a44d4b570a09..d9ea69786ad9 100644
--- a/R-package/R/xgb.plot.shap.R
+++ b/R-package/R/xgb.plot.shap.R
@@ -81,6 +81,7 @@
 #' xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none")
 #' contr <- predict(bst, agaricus.test$data, predcontrib = TRUE)
 #' xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3)
+#' xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12)  # Summary plot
 #'
 #' # multiclass example - plots for each class separately:
 #' nclass <- 3
@@ -99,6 +100,7 @@
 #'               n_col = 2, col = col, pch = 16, pch_NA = 17)
 #' xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4,
 #'               n_col = 2, col = col, pch = 16, pch_NA = 17)
+#' xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4)  # Summary plot
 #'
 #' @rdname xgb.plot.shap
 #' @export
@@ -109,69 +111,33 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
                           plot_NA = TRUE, col_NA = rgb(0.7, 0, 1, 0.6), pch_NA = '.', pos_NA = 1.07,
                           plot_loess = TRUE, col_loess = 2, span_loess = 0.5,
                           which = c("1d", "2d"), plot = TRUE, ...) {
-
-  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
-    stop("data: must be either matrix or dgCMatrix")
-
-  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
-    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
-
-  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
-    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
-
-  if (!is.null(shap_contrib) &&
-      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
-    stop("shap_contrib is not compatible with the provided data")
-
-  nsample <- if (is.null(subsample)) min(100000, nrow(data)) else as.integer(subsample * nrow(data))
-  idx <- sample(seq_len(nrow(data)), nsample)
-  data <- data[idx, ]
-
-  if (is.null(shap_contrib)) {
-    shap_contrib <- predict(model, data, predcontrib = TRUE, approxcontrib = approxcontrib)
-  } else {
-    shap_contrib <- shap_contrib[idx, ]
-  }
+  data_list <- xgb.shap.data(
+    data = data,
+    shap_contrib = shap_contrib,
+    features = features,
+    top_n = top_n,
+    model = model,
+    trees = trees,
+    target_class = target_class,
+    approxcontrib = approxcontrib,
+    subsample = subsample,
+    max_observations = 100000
+  )
+  data <- data_list[["data"]]
+  shap_contrib <- data_list[["shap_contrib"]]
+  features <- colnames(data)
 
   which <- match.arg(which)
   if (which == "2d")
     stop("2D plots are not implemented yet")
 
-  if (is.null(features)) {
-    imp <- xgb.importance(model = model, trees = trees)
-    top_n <- as.integer(top_n[1])
-    if (top_n < 1 && top_n > 100)
-      stop("top_n: must be an integer within [1, 100]")
-    features <- imp$Feature[1:min(top_n, NROW(imp))]
-  }
-
-  if (is.character(features)) {
-    if (is.null(colnames(data)))
-      stop("Either provide `data` with column names or provide `features` as column indices")
-    features <- match(features, colnames(data))
-  }
 
   if (n_col > length(features)) n_col <- length(features)
 
-  if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
-    shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]]
-                    else Reduce("+", lapply(shap_contrib, abs))
-  }
-
-  shap_contrib <- shap_contrib[, features, drop = FALSE]
-  data <- data[, features, drop = FALSE]
-  cols <- colnames(data)
-  if (is.null(cols)) cols <- colnames(shap_contrib)
-  if (is.null(cols)) cols <- paste0('X', seq_len(ncol(data)))
-  colnames(data) <- cols
-  colnames(shap_contrib) <- cols
-
   if (plot && which == "1d") {
     op <- par(mfrow = c(ceiling(length(features) / n_col), n_col),
               oma = c(0, 0, 0, 0) + 0.2,
               mar = c(3.5, 3.5, 0, 0) + 0.1,
               mgp = c(1.7, 0.6, 0))
-    for (f in cols) {
+    for (f in features) {
       ord <- order(data[, f])
       x <- data[, f][ord]
       y <- shap_contrib[, f][ord]
@@ -216,3 +182,105 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1,
   }
   invisible(list(data = data, shap_contrib = shap_contrib))
 }
+
+#' SHAP contribution dependency summary plot
+#'
+#' Compare SHAP contributions of different features.
+#'
+#' A point plot (each point representing one sample from \code{data}) is
+#' produced for each feature, with the points plotted on the SHAP value axis.
+#' Each point (observation) is coloured based on its feature value. The plot
+#' hence allows us to see which features have a negative / positive contribution
+#' to the model prediction, and whether the contribution is different for larger
+#' or smaller values of the feature. We effectively try to replicate the
+#' \code{summary_plot} function from https://github.com/slundberg/shap.
+#'
+#' @inheritParams xgb.plot.shap
+#'
+#' @return A \code{ggplot2} object.
+#' @export
+#'
+#' @examples
+#' # See \code{\link{xgb.plot.shap}}.
+#' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}},
+#'   \url{https://github.com/slundberg/shap}
+xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL,
+                                  trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) {
+  # Only ggplot implementation is available.
+  xgb.ggplot.shap.summary(data, shap_contrib, features, top_n, model, trees, target_class, approxcontrib, subsample)
+}
+
+#' Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc.
+#' Internal utility function.
+#'
+#' @return A list containing: 'data', a matrix containing sample observations
+#'   and their feature values; 'shap_contrib', a matrix containing the SHAP contribution
+#'   values for these observations.
+xgb.shap.data <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, model = NULL,
+                          trees = NULL, target_class = NULL, approxcontrib = FALSE,
+                          subsample = NULL, max_observations = 100000) {
+  if (!is.matrix(data) && !inherits(data, "dgCMatrix"))
+    stop("data: must be either matrix or dgCMatrix")
+
+  if (is.null(shap_contrib) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when shap_contrib is not provided, one must provide an xgb.Booster model")
+
+  if (is.null(features) && (is.null(model) || !inherits(model, "xgb.Booster")))
+    stop("when features are not provided, one must provide an xgb.Booster model to rank the features")
+
+  if (!is.null(shap_contrib) &&
+      (!is.matrix(shap_contrib) || nrow(shap_contrib) != nrow(data) || ncol(shap_contrib) != ncol(data) + 1))
+    stop("shap_contrib is not compatible with the provided data")
+
+  if (is.character(features) && is.null(colnames(data)))
+    stop("either provide `data` with column names or provide `features` as column indices")
+
+  if (is.null(model$feature_names) && model$nfeatures != ncol(data))
+    stop("if model has no feature_names, columns in `data` must match features in model")
+
+  if (!is.null(subsample)) {
+    idx <- sample(x = seq_len(nrow(data)), size = as.integer(subsample * nrow(data)), replace = FALSE)
+  } else {
+    idx <- seq_len(min(nrow(data), max_observations))
+  }
+  data <- data[idx, ]
+  if (is.null(colnames(data))) {
+    colnames(data) <- paste0("X", seq_len(ncol(data)))
+  }
+
+  if (!is.null(shap_contrib)) {
+    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
+    }
+    shap_contrib <- shap_contrib[idx, ]
+    if (is.null(colnames(shap_contrib))) {
+      colnames(shap_contrib) <- paste0("X", seq_len(ncol(data)))
+    }
+  } else {
+    shap_contrib <- predict(model, newdata = data, predcontrib = TRUE, approxcontrib = approxcontrib)
+    if (is.list(shap_contrib)) { # multiclass: either choose a class or merge
+      shap_contrib <- if (!is.null(target_class)) shap_contrib[[target_class + 1]] else Reduce("+", lapply(shap_contrib, abs))
+    }
+  }
+
+  if (is.null(features)) {
+    if (!is.null(model$feature_names)) {
+      imp <- xgb.importance(model = model, trees = trees)
+    } else {
+      imp <- xgb.importance(model = model, trees = trees, feature_names = colnames(data))
+    }
+    top_n <- top_n[1]
+    if (top_n < 1 | top_n > 100) stop("top_n: must be an integer within [1, 100]")
+    features <- imp$Feature[1:min(top_n, NROW(imp))]
+  }
+  if (is.character(features)) {
+    features <- match(features, colnames(data))
+  }
+
+  shap_contrib <- shap_contrib[, features, drop = FALSE]
+  data <- data[, features, drop = FALSE]
+
+  list(
+    data = data,
+    shap_contrib = shap_contrib
+  )
+}
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index 2ee1acf568a7..86c0efd0207e 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -351,11 +351,47 @@ test_that("xgb.plot.deepness works", {
   xgb.ggplot.deepness(model = bst.Tree)
 })
 
+test_that("xgb.shap.data works when top_n is provided", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  expect_equal(names(data_list), c("data", "shap_contrib"))
+  expect_equal(NCOL(data_list$data), 2)
+  expect_equal(NCOL(data_list$shap_contrib), 2)
+  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
+  expect_gt(length(colnames(data_list$data)), 0)
+  expect_gt(length(colnames(data_list$shap_contrib)), 0)
+
+  # for multiclass without target class provided
+  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2)
+  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
+  # for multiclass with target class provided
+  data_list <- xgb.shap.data(data = as.matrix(iris[, -5]), model = mbst.Tree, top_n = 2, target_class = 0)
+  expect_equal(dim(data_list$shap_contrib), c(nrow(iris), 2))
+})
+
+test_that("xgb.shap.data works with subsampling", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2, subsample = 0.8)
+  expect_equal(NROW(data_list$data), as.integer(0.8 * nrow(sparse_matrix)))
+  expect_equal(NROW(data_list$data), NROW(data_list$shap_contrib))
+})
+
+test_that("prepare.ggplot.shap.data works", {
+  data_list <- xgb.shap.data(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  plot_data <- prepare.ggplot.shap.data(data_list, normalize = TRUE)
+  expect_s3_class(plot_data, "data.frame")
+  expect_equal(names(plot_data), c("id", "feature", "feature_value", "shap_value"))
+  expect_s3_class(plot_data$feature, "factor")
+  # Each observation should have 1 row for each feature
+  expect_equal(nrow(plot_data), nrow(sparse_matrix) * 2)
+})
+
 test_that("xgb.plot.shap works", {
   sh <- xgb.plot.shap(data = sparse_matrix, model = bst.Tree, top_n = 2, col = 4)
   expect_equal(names(sh), c("data", "shap_contrib"))
-  expect_equal(NCOL(sh$data), 2)
-  expect_equal(NCOL(sh$shap_contrib), 2)
+})
+
+test_that("xgb.plot.shap.summary works", {
+  xgb.plot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2)
+  xgb.ggplot.shap.summary(data = sparse_matrix, model = bst.Tree, top_n = 2)
 })
 
 test_that("check.deprecation works", {
diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index 8220135d9e32..792b43797ce5 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -68,6 +68,7 @@
 #include "../src/learner.cc"
 #include "../src/logging.cc"
 #include "../src/common/common.cc"
+#include "../src/common/random.cc"
 #include "../src/common/charconv.cc"
 #include "../src/common/timer.cc"
 #include "../src/common/quantile.cc"
diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py
new file mode 100644
index 000000000000..b9cee8c050af
--- /dev/null
+++ b/demo/guide-python/feature_weights.py
@@ -0,0 +1,49 @@
+'''Using feature weights to change column sampling.
+
+    .. versionadded:: 1.3.0
+'''
+
+import numpy as np
+import xgboost
+from matplotlib import pyplot as plt
+import argparse
+
+
+def main(args):
+    rng = np.random.RandomState(1994)
+
+    kRows = 1000
+    kCols = 10
+
+    X = rng.randn(kRows, kCols)
+    y = rng.randn(kRows)
+    fw = np.ones(shape=(kCols,))
+    for i in range(kCols):
+        fw[i] *= float(i)
+
+    dtrain = xgboost.DMatrix(X, y)
+    dtrain.set_info(feature_weights=fw)
+
+    bst = xgboost.train({'tree_method': 'hist',
+                         'colsample_bynode': 0.5},
+                        dtrain, num_boost_round=10,
+                        evals=[(dtrain, 'd')])
+    feature_map = bst.get_fscore()
+    # feature zero has 0 weight
+    assert feature_map.get('f0', None) is None
+    assert max(feature_map.values()) == feature_map.get('f9')
+
+    if args.plot:
+        xgboost.plot_importance(bst)
+        plt.show()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--plot',
+        type=int,
+        default=1,
+        help='Set to 0 to disable plotting the feature importances.')
+    args = parser.parse_args()
+    main(args)
diff --git a/demo/json-model/json_parser.py b/demo/json-model/json_parser.py
index eedcbf9c2287..c41a44d881c8 100644
--- a/demo/json-model/json_parser.py
+++ b/demo/json-model/json_parser.py
@@ -94,7 +94,7 @@ def __str__(self):
 
 class Model:
     '''Gradient boosted tree model.'''
-    def __init__(self, m: dict):
+    def __init__(self, model: dict):
         '''Construct the Model from JSON object.
 
         parameters
diff --git a/doc/conf.py b/doc/conf.py
index d17f9594a285..749d400c6e8f 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -22,7 +22,7 @@ import guzzle_sphinx_theme
 
 git_branch = os.getenv('SPHINX_GIT_BRANCH', default=None)
-if git_branch is None:
+if not git_branch:
     # If SPHINX_GIT_BRANCH environment variable is not given, run git
     # to determine branch name
     git_branch = [
@@ -30,6 +30,8 @@
         git.branch('-r', '--contains', 'HEAD')).rstrip('\n').split('\n')
     ]
     git_branch = [x for x in git_branch if 'HEAD' not in x]
+else:
+    git_branch = [git_branch]
 print('git_branch = {}'.format(git_branch[0]))
 try:
     filename, _ = urllib.request.urlretrieve(
diff --git a/doc/jvm/index.rst b/doc/jvm/index.rst
index a4c9cdd53abe..6b3bf9348c12 100644
--- a/doc/jvm/index.rst
+++ b/doc/jvm/index.rst
@@ -65,6 +65,8 @@ This will check out the latest stable version from the Maven Central.
 For the latest release version number, please check `here <https://github.com/dmlc/xgboost/releases>`_.
 
+To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+
 .. note:: Using Maven repository hosted by the XGBoost project
 
   There may be some delay until a new release becomes available to Maven Central. If you would like to access the latest release immediately, add the Maven repository hosted by the XGBoost project:
@@ -83,6 +85,11 @@
 For the latest release version number, please check `here <https://github.com/dmlc/xgboost/releases>`_.
 
+To enable the GPU algorithm (``tree_method='gpu_hist'``), use artifacts ``xgboost4j-gpu_2.12`` and ``xgboost4j-spark-gpu_2.12`` instead (note the ``gpu`` suffix).
+
 Installation from source
 ========================
diff --git a/doc/parameter.rst b/doc/parameter.rst
index 626ddf10f8ab..7e7e774a2bfa 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -107,6 +107,10 @@ Parameters for Tree Booster
   'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at
   each split.
 
+  On the Python interface, one can set the ``feature_weights`` for DMatrix to define the
+  probability of each feature being selected when using column sampling.  There is a
+  similar parameter for the ``fit`` method in the sklearn interface.
+
 * ``lambda`` [default=1, alias: ``reg_lambda``]
 
   - L2 regularization term on weights. Increasing this value will make model more conservative.
@@ -224,7 +228,7 @@ Parameters for Tree Booster
   list is a group of indices of features that are allowed to interact with each other.
   See tutorial for more information
 
-Additional parameters for ``hist`` and ```gpu_hist`` tree method
+Additional parameters for ``hist`` and ``gpu_hist`` tree method
 ================================================================
 
 * ``single_precision_histogram``, [default=``false``]
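The paragraph added to doc/parameter.rst above describes the new ``feature_weights`` meta info. A short sketch of the Python usage it refers to, mirroring the demo introduced in this change set (a zero weight excludes a feature from column sampling):

```python
# feature_weights biases which columns colsample_by* draws at each sampling
# step; f0 gets weight 0 and should never be selected.
import numpy as np
import xgboost as xgb

X = np.random.randn(512, 8)
y = np.random.randn(512)

dtrain = xgb.DMatrix(X, y)
dtrain.set_info(feature_weights=np.arange(8, dtype=np.float32))

bst = xgb.train({'colsample_bynode': 0.5, 'tree_method': 'hist'},
                dtrain, num_boost_round=8)
print(bst.get_fscore())  # 'f0' should be absent: its weight is zero
```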
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 794cbdf19e8f..4db461d11b1c 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -483,6 +483,34 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
                                        bst_ulong *size,
                                        const char ***out_features);
 
+/*!
+ * \brief Set meta info from dense matrix.  Valid field names are:
+ *
+ *  - label
+ *  - weight
+ *  - base_margin
+ *  - group
+ *  - label_lower_bound
+ *  - label_upper_bound
+ *  - feature_weights
+ *
+ * \param handle An instance of data matrix
+ * \param field  Field name
+ * \param data   Pointer to consecutive memory storing data.
+ * \param size   Size of the data, this is relative to size of type.  (Meaning NOT number
+ *               of bytes.)
+ * \param type   Indicator of data type.  This is defined in xgboost::DataType enum class.
+ *
+ *    float    = 1
+ *    double   = 2
+ *    uint32_t = 3
+ *    uint64_t = 4
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, bst_ulong size, int type);
+
 /*!
  * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
  * \param handle a instance of data matrix
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index 1ee292a89edb..f74dbd2c5a76 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -88,34 +88,17 @@ class MetaInfo {
   * \brief Type of each feature.  Automatically set when feature_type_names is specifed.
   */
  HostDeviceVector<FeatureType> feature_types;
+  /*
+   * \brief Weight of each feature, used to define the probability of each feature being
+   *   selected when using column sampling.
+   */
+  HostDeviceVector<float> feature_weigths;
 
  /*! \brief default constructor */
  MetaInfo()  = default;
  MetaInfo(MetaInfo&& that) = default;
  MetaInfo& operator=(MetaInfo&& that) = default;
-  MetaInfo& operator=(MetaInfo const& that) {
-    this->num_row_ = that.num_row_;
-    this->num_col_ = that.num_col_;
-    this->num_nonzero_ = that.num_nonzero_;
-
-    this->labels_.Resize(that.labels_.Size());
-    this->labels_.Copy(that.labels_);
-
-    this->group_ptr_ = that.group_ptr_;
-
-    this->weights_.Resize(that.weights_.Size());
-    this->weights_.Copy(that.weights_);
-
-    this->base_margin_.Resize(that.base_margin_.Size());
-    this->base_margin_.Copy(that.base_margin_);
-
-    this->labels_lower_bound_.Resize(that.labels_lower_bound_.Size());
-    this->labels_lower_bound_.Copy(that.labels_lower_bound_);
-
-    this->labels_upper_bound_.Resize(that.labels_upper_bound_.Size());
-    this->labels_upper_bound_.Copy(that.labels_upper_bound_);
-    return *this;
-  }
+  MetaInfo& operator=(MetaInfo const& that) = delete;
 
  /*!
   * \brief Validate all metainfo.
diff --git a/include/xgboost/generic_parameters.h b/include/xgboost/generic_parameters.h
index 752342b9a90c..a78453604467 100644
--- a/include/xgboost/generic_parameters.h
+++ b/include/xgboost/generic_parameters.h
@@ -27,7 +27,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
   int gpu_id;
   // gpu page size in external memory mode, 0 means using the default.
   size_t gpu_page_size;
-  bool enable_experimental_json_serialization {false};
+  bool enable_experimental_json_serialization {true};
   bool validate_parameters {false};
 
   void CheckDeprecated() {
@@ -68,7 +68,7 @@ struct GenericParameter : public XGBoostParameter<GenericParameter> {
         .set_lower_bound(0)
         .describe("GPU page size when running in external memory mode.");
     DMLC_DECLARE_FIELD(enable_experimental_json_serialization)
-        .set_default(false)
+        .set_default(true)
         .describe("Enable using JSON for memory serialization (Python Pickle, "
                   "rabit checkpoints etc.).");
     DMLC_DECLARE_FIELD(validate_parameters)
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
index e7f6dc8ec089..fd9c69df3e7b 100644
--- a/include/xgboost/tree_model.h
+++ b/include/xgboost/tree_model.h
@@ -59,6 +59,21 @@ struct TreeParam : public dmlc::Parameter<TreeParam> {
     num_nodes = 1;
     deprecated_num_roots = 1;
   }
+
+  // Swap byte order for all fields.  Useful for transporting models between machines with
+  // different endianness (big endian vs little endian)
+  inline TreeParam ByteSwap() const {
+    TreeParam x = *this;
+    dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1);
+    dmlc::ByteSwap(&x.num_nodes, sizeof(x.num_nodes), 1);
+    dmlc::ByteSwap(&x.num_deleted, sizeof(x.num_deleted), 1);
+    dmlc::ByteSwap(&x.deprecated_max_depth, sizeof(x.deprecated_max_depth), 1);
+    dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1);
+    dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1);
+    dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0]));
+    return x;
+  }
+
   // declare the parameters
   DMLC_DECLARE_PARAMETER(TreeParam) {
     // only declare the parameters that can be set by the user.
@@ -97,6 +112,16 @@ struct RTreeNodeStat {
     return loss_chg == b.loss_chg && sum_hess == b.sum_hess &&
            base_weight == b.base_weight && leaf_child_cnt == b.leaf_child_cnt;
   }
+  // Swap byte order for all fields.  Useful for transporting models between machines with
+  // different endianness (big endian vs little endian)
+  inline RTreeNodeStat ByteSwap() const {
+    RTreeNodeStat x = *this;
+    dmlc::ByteSwap(&x.loss_chg, sizeof(x.loss_chg), 1);
+    dmlc::ByteSwap(&x.sum_hess, sizeof(x.sum_hess), 1);
+    dmlc::ByteSwap(&x.base_weight, sizeof(x.base_weight), 1);
+    dmlc::ByteSwap(&x.leaf_child_cnt, sizeof(x.leaf_child_cnt), 1);
+    return x;
+  }
 };
 
 /*!
@@ -227,6 +252,16 @@ class RegTree : public Model {
              info_.leaf_value == b.info_.leaf_value;
     }
 
+    inline Node ByteSwap() const {
+      Node x = *this;
+      dmlc::ByteSwap(&x.parent_, sizeof(x.parent_), 1);
+      dmlc::ByteSwap(&x.cleft_, sizeof(x.cleft_), 1);
+      dmlc::ByteSwap(&x.cright_, sizeof(x.cright_), 1);
+      dmlc::ByteSwap(&x.sindex_, sizeof(x.sindex_), 1);
+      dmlc::ByteSwap(&x.info_, sizeof(x.info_), 1);
+      return x;
+    }
+
   private:
     /*!
     * \brief in leaf node, we have weights, in non-leaf nodes,
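The ByteSwap() helpers reverse the bytes of every fixed-width field so that a binary model written on a big-endian machine (e.g. the s390x worker added to the Travis matrix above) can be read on a little-endian one. The idea in a few lines of NumPy, purely as an illustration of what byte swapping does — this is not the XGBoost model format:

```python
# Same values, opposite byte order: swap the bytes, then relabel the dtype.
import numpy as np

fields = np.array([42, 7, -1], dtype='<i4')        # little-endian node fields
swapped = fields.byteswap().view(fields.dtype.newbyteorder('>'))

assert list(swapped) == [42, 7, -1]                # values are preserved
assert swapped.tobytes() != fields.tobytes()       # representations differ
```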
diff --git a/jvm-packages/README.md b/jvm-packages/README.md
index 5bc65e343cc0..7185e951c0ca 100644
--- a/jvm-packages/README.md
+++ b/jvm-packages/README.md
@@ -18,11 +18,11 @@ You can find more about XGBoost on [Documentation](https://xgboost.readthedocs.o
 ## Add Maven Dependency
 
-XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5
+XGBoost4J, XGBoost4J-Spark, etc. in the Maven repository are compiled with g++-4.8.5.
 
 ### Access release version
 
-maven
+Maven
 
 ```
@@ -30,66 +30,82 @@ XGBoost4J, XGBoost4J-Spark, etc. in maven repository is compiled with g++-4.8.5
 <dependency>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost4j_2.12</artifactId>
     <version>latest_version_num</version>
-</dependency>
-```
-
-sbt
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
+    <version>latest_version_num</version>
+</dependency>
+```
+
+sbt
 ```sbt
-  "ml.dmlc" %% "xgboost4j" % "latest_version_num"
-```
+libraryDependencies ++= Seq(
+  "ml.dmlc" %% "xgboost4j" % "latest_version_num",
+  "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num"
+)
+```
 
 For the latest release version number, please check [here](https://github.com/dmlc/xgboost/releases).
 
-if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
+To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
 
 ### Access SNAPSHOT version
 
-You need to add github as repo:
+First add the following Maven repository hosted by the XGBoost project:
 
-maven:
+Maven:
 
 ```xml
 <repository>
-  <id>GitHub Repo</id>
-  <name>GitHub Repo</name>
-  <url>https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/</url>
+  <id>XGBoost4J Snapshot Repo</id>
+  <name>XGBoost4J Snapshot Repo</name>
+  <url>https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/</url>
 </repository>
 ```
 
 sbt:
 
 ```sbt
-resolvers += "GitHub Repo" at "https://raw.githubusercontent.com/CodingCat/xgboost/maven-repo/"
+resolvers += "XGBoost4J Snapshot Repo" at "https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/snapshot/"
 ```
 
-the add dependency as following:
+Then add XGBoost4J as a dependency:
 
-maven
+Maven
 
 ```
 <dependency>
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost4j_2.12</artifactId>
-    <version>latest_version_num</version>
+    <version>latest_version_num-SNAPSHOT</version>
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark_2.12</artifactId>
+    <version>latest_version_num-SNAPSHOT</version>
 </dependency>
-```
-
-sbt
+```
+
+sbt
 ```sbt
-  "ml.dmlc" %% "xgboost4j" % "latest_version_num"
-```
+libraryDependencies ++= Seq(
+  "ml.dmlc" %% "xgboost4j" % "latest_version_num-SNAPSHOT",
+  "ml.dmlc" %% "xgboost4j-spark" % "latest_version_num-SNAPSHOT"
+)
+```
 
-For the latest release version number, please check [here](https://github.com/CodingCat/xgboost/tree/maven-repo/ml/dmlc/xgboost4j_2.12).
+For the latest release version number, please check [the repository listing](https://s3-us-west-2.amazonaws.com/xgboost-maven-repo/list.html).
 
-if you want to use `xgboost4j-spark`, you just need to replace xgboost4j with `xgboost4j-spark`
+To enable the GPU algorithm (`tree_method='gpu_hist'`), use artifacts `xgboost4j-gpu_2.12` and `xgboost4j-spark-gpu_2.12` instead.
 
 ## Examples
 
 Full code examples for Scala, Java, Apache Spark, and Apache Flink can be found in the [examples package](https://github.com/dmlc/xgboost/tree/master/jvm-packages/xgboost4j-example).
 
-**NOTE on LIBSVM Format**:
+**NOTE on LIBSVM Format**:
 
-There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost.
+There is an inconsistent issue between XGBoost4J-Spark and other language bindings of XGBoost.
 
 When users use Spark to load trainingset/testset in LibSVM format with the following code snippet:
@@ -108,7 +124,7 @@ You can build/package xgboost4j locally with the following steps:
 2. Clone this repo: `git clone --recursive https://github.com/dmlc/xgboost.git`
 3. Run the following command:
     - With Tests: `./xgboost/jvm-packages/dev/build-linux.sh`
-    - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests`
+    - Skip Tests: `./xgboost/jvm-packages/dev/build-linux.sh --skip-tests`
 
 **Windows:**
 1. Ensure [Docker for Windows](https://docs.docker.com/docker-for-windows/install/) is installed.
diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index fdca78ba403b..03de3bd1c019 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -6,7 +6,7 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
     <packaging>pom</packaging>
     <name>XGBoost JVM Package</name>
     <description>JVM Package for XGBoost</description>
diff --git a/jvm-packages/xgboost4j-example/pom.xml b/jvm-packages/xgboost4j-example/pom.xml
index b70f3e25f2e3..4f493caa444a 100644
--- a/jvm-packages/xgboost4j-example/pom.xml
+++ b/jvm-packages/xgboost4j-example/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-example_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
   <packaging>jar</packaging>
@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-spark_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
@@ -37,7 +37,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j-flink_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
diff --git a/jvm-packages/xgboost4j-flink/pom.xml b/jvm-packages/xgboost4j-flink/pom.xml
index 645822d2ad07..a65823a228a9 100644
--- a/jvm-packages/xgboost4j-flink/pom.xml
+++ b/jvm-packages/xgboost4j-flink/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-flink_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
@@ -26,7 +26,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.commons</groupId>
diff --git a/jvm-packages/xgboost4j-spark/pom.xml b/jvm-packages/xgboost4j-spark/pom.xml
index 115f563938ca..6435a17f37d9 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -6,7 +6,7 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j-spark_2.12</artifactId>
@@ -24,7 +24,7 @@
     <dependency>
       <groupId>ml.dmlc</groupId>
       <artifactId>xgboost4j_${scala.binary.version}</artifactId>
-      <version>1.2.0-SNAPSHOT</version>
+      <version>1.3.0-SNAPSHOT</version>
     </dependency>
     <dependency>
       <groupId>org.apache.spark</groupId>
diff --git a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
index df787d8eb8ab..15ffe4c06c42 100644
--- a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
+++ b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/DataUtils.scala
@@ -103,7 +103,8 @@ object DataUtils extends Serializable {
       case sparseVector: SparseVector =>
         featureValueOfSparseVector(rowHashCode, sparseVector)
     }
-    math.abs((rowHashCode.toLong + featureValue).toString.hashCode % numPartitions)
+    val nonNaNFeatureValue = if (featureValue.isNaN) { 0.0f } else { featureValue }
+    math.abs((rowHashCode.toLong + nonNaNFeatureValue).toString.hashCode % numPartitions)
   }
 
   private def attachPartitionKey(
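The one-line fix above matters because `rowHashCode.toLong + Double.NaN` is NaN for every row, so any row whose sampled feature value is missing hashed the same string "NaN" and the partition sizes became badly skewed. A Python model of the repaired key computation, with Python's `hash` standing in for the JVM's `String.hashCode`:

```python
# Zeroing the NaN restores the per-row variation from the row hash itself,
# so rows with missing values spread evenly across partitions again.
import math

def partition_key(row_hash: int, feature_value: float, num_partitions: int) -> int:
    if math.isnan(feature_value):   # the fix: fall back to 0.0
        feature_value = 0.0
    return abs(hash(str(row_hash + feature_value))) % num_partitions
```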
diff --git a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
index 986b0843b5f3..ff0492f41a4a 100644
--- a/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
+++ b/jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/DeterministicPartitioningSuite.scala
@@ -16,6 +16,7 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
+import org.apache.spark.ml.linalg.Vectors
 import org.scalatest.FunSuite
 import org.apache.spark.sql.functions._
@@ -79,4 +80,34 @@ class DeterministicPartitioningSuite extends FunSuite with TmpFolderPerSuite wit
         map2
       }
   }
+
+  test("deterministic partitioning has a uniform repartition on dataset with missing values") {
+    val N = 10000
+    val dataset = (0 until N).map{ n =>
+      (n, n % 2, Vectors.sparse(3, Array(0, 1, 2), Array(Double.NaN, n, Double.NaN)))
+    }
+
+    val df = ss.createDataFrame(sc.parallelize(dataset)).toDF("id", "label", "features")
+
+    val dfRepartitioned = DataUtils.convertDataFrameToXGBLabeledPointRDDs(
+      col("label"),
+      col("features"),
+      lit(1.0),
+      lit(Float.NaN),
+      None,
+      10,
+      deterministicPartition = true,
+      df
+    ).head
+
+    val partitionsSizes = dfRepartitioned
+      .mapPartitions(iter => Array(iter.size.toDouble).iterator, true)
+      .collect()
+    val partitionMean = partitionsSizes.sum / partitionsSizes.length
+    val squaredDiffSum = partitionsSizes
+      .map(partitionSize => Math.pow(partitionSize - partitionMean, 2))
+    val standardDeviation = math.sqrt(squaredDiffSum.sum / squaredDiffSum.length)
+
+    assert(standardDeviation < math.sqrt(N.toDouble))
+  }
 }
diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml
index 927e6d42e418..fff44d9ea37d 100644
--- a/jvm-packages/xgboost4j/pom.xml
+++ b/jvm-packages/xgboost4j/pom.xml
@@ -6,10 +6,10 @@
     <groupId>ml.dmlc</groupId>
     <artifactId>xgboost-jvm_2.12</artifactId>
-    <version>1.2.0-SNAPSHOT</version>
+    <version>1.3.0-SNAPSHOT</version>
   </parent>
   <artifactId>xgboost4j_2.12</artifactId>
-  <version>1.2.0-SNAPSHOT</version>
+  <version>1.3.0-SNAPSHOT</version>
   <packaging>jar</packaging>
diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION
index 468e6c357b7a..9d7c109bb7dc 100644
--- a/python-package/xgboost/VERSION
+++ b/python-package/xgboost/VERSION
@@ -1 +1 @@
-1.2.0-SNAPSHOT
+1.3.0-SNAPSHOT
diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 4bc77783ee91..c8d0460825e5 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -40,7 +40,7 @@ class EarlyStopException(Exception):
     """
 
     def __init__(self, best_iteration):
-        super(EarlyStopException, self).__init__()
+        super().__init__()
         self.best_iteration = best_iteration
 
@@ -422,7 +422,7 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
             raise TypeError('Input data can not be a list.')
 
         self.missing = missing if missing is not None else np.nan
-        self.nthread = nthread if nthread is not None else 1
+        self.nthread = nthread if nthread is not None else -1
         self.silent = silent
 
         # force into void_p, mac need to pass things in as void_p
@@ -455,7 +455,8 @@ def set_info(self,
                  label_lower_bound=None, label_upper_bound=None,
                  feature_names=None,
-                 feature_types=None):
+                 feature_types=None,
+                 feature_weights=None):
         '''Set meta info for DMatrix.'''
         if label is not None:
             self.set_label(label)
@@ -473,6 +474,10 @@ def set_info(self,
             self.feature_names = feature_names
         if feature_types is not None:
             self.feature_types = feature_types
+        if feature_weights is not None:
+            from .data import dispatch_meta_backend
+            dispatch_meta_backend(matrix=self, data=feature_weights,
+                                  name='feature_weights')
 
     def get_float_info(self, field):
         """Get float property from the DMatrix.
@@ -1460,8 +1465,12 @@ def reshape_output(predt, rows):
                               ctypes.c_uint(iteration_range[1]))
 
         # once caching is supported, we can pass id(data) as cache id.
-        if isinstance(data, DataFrame):
-            data = data.values
+        try:
+            import pandas as pd
+            if isinstance(data, pd.DataFrame):
+                data = data.values
+        except ImportError:
+            pass
         if isinstance(data, np.ndarray):
             assert data.flags.c_contiguous
             arr = np.array(data.reshape(data.size), copy=False,
diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 9491efd1c38c..e4c05dcc244e 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads,
     raise TypeError('Not supported type for data.' + str(type(data)))
 
 
+def _to_data_type(dtype: str, name: str):
+    dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
+    if dtype not in dtype_map.keys():
+        raise TypeError(
+            f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
+            f'for {name}.')
+    return dtype_map[dtype]
+
+
+def _validate_meta_shape(data):
+    if hasattr(data, 'shape'):
+        assert len(data.shape) == 1 or (
+            len(data.shape) == 2 and
+            (data.shape[1] == 0 or data.shape[1] == 1))
+
+
 def _meta_from_numpy(data, field, dtype, handle):
     data = _maybe_np_slice(data, dtype)
-    if dtype == 'uint32':
-        c_data = c_array(ctypes.c_uint32, data)
-        _check_call(_LIB.XGDMatrixSetUIntInfo(handle,
-                                              c_str(field),
-                                              c_array(ctypes.c_uint, data),
-                                              c_bst_ulong(len(data))))
-    elif dtype == 'float':
-        c_data = c_array(ctypes.c_float, data)
-        _check_call(_LIB.XGDMatrixSetFloatInfo(handle,
-                                               c_str(field),
-                                               c_data,
-                                               c_bst_ulong(len(data))))
-    else:
-        raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field)
+    interface = data.__array_interface__
+    assert interface.get('mask', None) is None, 'Masked array is not supported'
+    size = data.shape[0]
+
+    c_type = _to_data_type(str(data.dtype), field)
+    ptr = interface['data'][0]
+    ptr = ctypes.c_void_p(ptr)
+    _check_call(_LIB.XGDMatrixSetDenseInfo(
+        handle,
+        c_str(field),
+        ptr,
+        c_bst_ulong(size),
+        c_type
+    ))
 
 
 def _meta_from_list(data, field, dtype, handle):
@@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle):
 def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
     '''Dispatch for meta info.'''
     handle = matrix.handle
+    _validate_meta_shape(data)
     if data is None:
         return
     if _is_list(data):
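The rewritten `_meta_from_numpy()` above no longer copies meta info into a ctypes array; it hands `XGDMatrixSetDenseInfo` the address of the NumPy buffer taken from `__array_interface__`. A small self-contained sketch of that pointer hand-off:

```python
# numpy's __array_interface__ exposes the raw address of the underlying
# buffer, so C code can read the data in place without an extra copy.
import ctypes
import numpy as np

arr = np.arange(4, dtype=np.float32)
iface = arr.__array_interface__
ptr = ctypes.c_void_p(iface['data'][0])    # address of arr's buffer

# Reading the buffer back through the pointer recovers the same values.
view = (ctypes.c_float * arr.size).from_address(ptr.value)
assert list(view) == [0.0, 1.0, 2.0, 3.0]
```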
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index f533f7f3477d..96d358128a9a 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -441,6 +441,7 @@ def load_model(self, fname):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None, early_stopping_rounds=None,
             verbose=True, xgb_model=None, sample_weight_eval_set=None,
+            feature_weights=None,
             callbacks=None):
         # pylint: disable=invalid-name,attribute-defined-outside-init
         """Fit gradient boosting model
@@ -459,9 +460,6 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             A list of (X, y) tuple pairs to use as validation sets, for which
             metrics will be computed.
             Validation metrics will help us track the performance of the model.
-        sample_weight_eval_set : list, optional
-            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
-            instance weights on the i-th validation set.
         eval_metric : str, list of str, or callable, optional
             If a str, should be a built-in evaluation metric to use. See
             doc/parameter.rst.
@@ -490,6 +488,13 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        sample_weight_eval_set : list, optional
+            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
+            instance weights on the i-th validation set.
+        feature_weights : array_like
+            Weight for each feature, defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each iteration.
             It is possible to use predefined callbacks by using :ref:`callback_api`.
@@ -498,6 +503,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             .. code-block:: python
 
                 [xgb.callback.reset_learning_rate(custom_rates)]
+
         """
         self.n_features_in_ = X.shape[1]
 
@@ -505,6 +511,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
                                 base_margin=base_margin,
                                 missing=self.missing,
                                 nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
 
         evals_result = {}
@@ -750,7 +757,10 @@ def intercept_(self):
 
 @xgboost_model_doc(
     "Implementation of the scikit-learn API for XGBoost classification.",
-    ['model', 'objective'])
+    ['model', 'objective'], extra_parameters='''
+    n_estimators : int
+        Number of boosting rounds.
+''')
 class XGBClassifier(XGBModel, XGBClassifierBase):
     # pylint: disable=missing-docstring,invalid-name,too-many-instance-attributes
     def __init__(self, objective="binary:logistic", **kwargs):
@@ -759,7 +769,7 @@ def __init__(self, objective="binary:logistic", **kwargs):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
-            sample_weight_eval_set=None, callbacks=None):
+            sample_weight_eval_set=None, feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
 
         evals_result = {}
@@ -821,6 +831,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
 
         self._Booster = train(xgb_options, train_dmatrix,
                               self.get_num_boosting_rounds(),
@@ -1014,7 +1025,7 @@ def __init__(self,
                          **kwargs)
 
     def get_xgb_params(self):
-        params = super(XGBRFClassifier, self).get_xgb_params()
+        params = super().get_xgb_params()
         params['num_parallel_tree'] = self.n_estimators
         return params
 
@@ -1033,7 +1044,10 @@ def __init__(self, objective="reg:squarederror", **kwargs):
 
 @xgboost_model_doc(
     "scikit-learn API for XGBoost random forest regression.",
-    ['model', 'objective'])
+    ['model', 'objective'], extra_parameters='''
+    n_estimators : int
+        Number of trees in random forest to fit.
+''')
 class XGBRFRegressor(XGBRegressor):
     # pylint: disable=missing-docstring
     def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8,
@@ -1043,7 +1057,7 @@ def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8,
                          reg_lambda=reg_lambda, **kwargs)
 
     def get_xgb_params(self):
-        params = super(XGBRFRegressor, self).get_xgb_params()
+        params = super().get_xgb_params()
         params['num_parallel_tree'] = self.n_estimators
         return params
 
@@ -1101,10 +1115,10 @@ def __init__(self, objective='rank:pairwise', **kwargs):
             raise ValueError("please use XGBRanker for ranking task")
 
     def fit(self, X, y, group, sample_weight=None, base_margin=None,
-            eval_set=None,
-            sample_weight_eval_set=None, eval_group=None, eval_metric=None,
+            eval_set=None, sample_weight_eval_set=None,
+            eval_group=None, eval_metric=None,
             early_stopping_rounds=None, verbose=False, xgb_model=None,
-            callbacks=None):
+            feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """Fit gradient boosting ranker
 
@@ -1170,6 +1184,10 @@ def fit(self, X, y, group, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        feature_weights : array_like
+            Weight for each feature, defines the probability of each feature
+            being selected when colsample is being used.  All values must be
+            greater than 0, otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each iteration.
             It is possible to use predefined callbacks by using
@@ -1205,6 +1223,7 @@ def _dmat_init(group, **params):
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
         train_dmatrix.set_group(group)
 
         evals_result = {}
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index aa6ecf43a784..397f83e69bf8 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -316,6 +316,17 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
   API_END();
 }
 
+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, xgboost::bst_ulong size,
+                                  int type) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();
+  CHECK(type >= 1 && type <= 4);
+  info.SetInfo(field, data, static_cast<DataType>(type), size);
+  API_END();
+}
+
 XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
                               const unsigned* group,
                               xgboost::bst_ulong len) {
diff --git a/src/common/common.h b/src/common/common.h
index b0bd6b6d6cec..a4397d1c89aa 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -9,12 +9,15 @@
 
 #include <exception>
 #include <limits>
+#include <algorithm>
 #include <functional>
+#include <numeric>
 #include <vector>
 #include <string>
 #include <sstream>
 #include <type_traits>
 #include <utility>
+#include <cinttypes>
 
 #if defined(__CUDACC__)
 #include
@@ -160,6 +163,15 @@ inline void AssertOneAPISupport() {
 #endif  // XGBOOST_USE_ONEAPI
 }
 
+template <typename Idx, typename V, typename Comp = std::less<V>>
+std::vector<Idx> ArgSort(std::vector<V> const &array, Comp comp = std::less<V>{}) {
+  std::vector<Idx> result(array.size());
+  std::iota(result.begin(), result.end(), 0);
+  std::stable_sort(
+      result.begin(), result.end(),
+      [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); });
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_
* \param threadid the id of thread that calls this function */ - void Push(std::size_t key, ValueType value, int threadid) { + void Push(std::size_t key, ValueType&& value, int threadid) { size_t offset_key = key - base_row_offset_; SizeType &rp = thread_rptr_[threadid][offset_key]; - data_[rp++] = value; + data_[rp++] = std::move(value); } private: diff --git a/src/common/hist_util.h b/src/common/hist_util.h index d86b73135f34..0334b901224a 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -116,26 +116,14 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins) { for (auto& column : column_sizes) { column.resize(info.num_col_, 0); } - for (auto const& page : m->GetBatches()) { - page.data.HostVector(); - page.offset.HostVector(); - ParallelFor(page.Size(), threads, [&](size_t i) { - auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); - auto row = page[i]; - auto const *p_row = row.data(); - for (size_t j = 0; j < row.size(); ++j) { - local_column_sizes.at(p_row[j].index)++; - } - }); - } std::vector reduced(info.num_col_, 0); - - ParallelFor(info.num_col_, threads, [&](size_t i) { - for (auto const &thread : column_sizes) { - reduced[i] += thread[i]; + for (auto const& page : m->GetBatches()) { + auto const &entries_per_column = + HostSketchContainer::CalcColumnSize(page, info.num_col_, threads); + for (size_t i = 0; i < entries_per_column.size(); ++i) { + reduced[i] += entries_per_column[i]; } - }); - + } HostSketchContainer container(reduced, max_bins, HostSketchContainer::UseGroup(info)); for (auto const &page : m->GetBatches()) { diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 374864c8f4b0..9ab48a304b77 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -25,34 +25,67 @@ HostSketchContainer::HostSketchContainer(std::vector columns_size, } } -std::vector LoadBalance(SparsePage const &page, - std::vector columns_size, - size_t const nthreads) { - /* Some sparse datasets have their mass concentrating on small - * number of features. To avoid wating for a few threads running - * forever, we here distirbute different number of columns to - * different threads according to number of entries. */ - size_t const total_entries = page.data.Size(); +std::vector +HostSketchContainer::CalcColumnSize(SparsePage const &batch, + bst_feature_t const n_columns, + size_t const nthreads) { + auto page = batch.GetView(); + std::vector> column_sizes(nthreads); + for (auto &column : column_sizes) { + column.resize(n_columns, 0); + } + + ParallelFor(page.Size(), nthreads, [&](size_t i) { + auto &local_column_sizes = column_sizes.at(omp_get_thread_num()); + auto row = page[i]; + auto const *p_row = row.data(); + for (size_t j = 0; j < row.size(); ++j) { + local_column_sizes.at(p_row[j].index)++; + } + }); + std::vector entries_per_columns(n_columns, 0); + ParallelFor(n_columns, nthreads, [&](size_t i) { + for (auto const &thread : column_sizes) { + entries_per_columns[i] += thread[i]; + } + }); + return entries_per_columns; +} + +std::vector HostSketchContainer::LoadBalance( + SparsePage const &batch, bst_feature_t n_columns, size_t const nthreads) { + /* Some sparse datasets have their mass concentrating on small number of features. To + * avoid wating for a few threads running forever, we here distirbute different number + * of columns to different threads according to number of entries. 
+ */ + auto page = batch.GetView(); + size_t const total_entries = page.data.size(); size_t const entries_per_thread = common::DivRoundUp(total_entries, nthreads); - std::vector cols_ptr(nthreads+1, 0); + std::vector> column_sizes(nthreads); + for (auto& column : column_sizes) { + column.resize(n_columns, 0); + } + std::vector entries_per_columns = + CalcColumnSize(batch, n_columns, nthreads); + std::vector cols_ptr(nthreads + 1, 0); size_t count {0}; size_t current_thread {1}; - for (auto col : columns_size) { - cols_ptr[current_thread]++; // add one column to thread + for (auto col : entries_per_columns) { + cols_ptr.at(current_thread)++; // add one column to thread count += col; - if (count > entries_per_thread + 1) { + CHECK_LE(count, total_entries); + if (count > entries_per_thread) { current_thread++; count = 0; - cols_ptr[current_thread] = cols_ptr[current_thread-1]; + cols_ptr.at(current_thread) = cols_ptr[current_thread-1]; } } // Idle threads. for (; current_thread < cols_ptr.size() - 1; ++current_thread) { cols_ptr[current_thread+1] = cols_ptr[current_thread]; } - return cols_ptr; } @@ -67,11 +100,10 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, // Use group index for weights? auto batch = page.GetView(); dmlc::OMPException exec; - // Parallel over columns. Asumming the data is dense, each thread owns a set of - // consecutive columns. + // Parallel over columns. Each thread owns a set of consecutive columns. auto const ncol = static_cast(info.num_col_); auto const is_dense = info.num_nonzero_ == info.num_col_ * info.num_row_; - auto thread_columns_ptr = LoadBalance(page, columns_size_, nthread); + auto thread_columns_ptr = LoadBalance(page, info.num_col_, nthread); #pragma omp parallel num_threads(nthread) { @@ -112,58 +144,158 @@ void HostSketchContainer::PushRowPage(SparsePage const &page, monitor_.Stop(__func__); } -void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, - int max_bin, HistogramCuts *cuts) { - size_t required_cuts = std::min(summary.size, static_cast(max_bin)); - auto& cut_values = cuts->cut_values_.HostVector(); - for (size_t i = 1; i < required_cuts; ++i) { - bst_float cpt = summary.data[i].value; - if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { - cut_values.push_back(cpt); - } +void HostSketchContainer::GatherSketchInfo( + std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches) { + auto& worker_segments = *p_worker_segments; + worker_segments.resize(1, 0); + auto world = rabit::GetWorldSize(); + auto rank = rabit::GetRank(); + auto n_columns = sketches_.size(); + + std::vector sketch_size; + for (auto const& sketch : reduced) { + sketch_size.push_back(sketch.size); + } + std::vector& sketches_scan = *p_sketches_scan; + sketches_scan.resize((n_columns + 1) * world, 0); + size_t beg_scan = rank * (n_columns + 1); + std::partial_sum(sketch_size.cbegin(), sketch_size.cend(), + sketches_scan.begin() + beg_scan + 1); + // Gather all column pointers + rabit::Allreduce(sketches_scan.data(), sketches_scan.size()); + + for (int32_t i = 0; i < world; ++i) { + size_t back = (i + 1) * (n_columns + 1) - 1; + auto n_entries = sketches_scan.at(back); + worker_segments.push_back(n_entries); + } + // Offset of sketch from each worker. 
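+  // (illustrative) the scan below turns worker_segments into a CSC-style
+  // pointer array over workers: worker i's entries end up occupying the
+  // half-open range [worker_segments[i], worker_segments[i + 1]) of
+  // global_sketches.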
+ std::partial_sum(worker_segments.begin(), worker_segments.end(), + worker_segments.begin()); + CHECK_GE(worker_segments.size(), 1); + auto total = worker_segments.back(); + + auto& global_sketches = *p_global_sketches; + global_sketches.resize(total, WQSketch::Entry{0, 0, 0, 0}); + auto worker_sketch = Span{global_sketches}.subspan( + worker_segments[rank], worker_segments[rank + 1] - worker_segments[rank]); + size_t cursor = 0; + for (auto const &sketch : reduced) { + std::copy(sketch.data, sketch.data + sketch.size, + worker_sketch.begin() + cursor); + cursor += sketch.size; } + + static_assert(sizeof(WQSketch::Entry) / 4 == sizeof(float), ""); + rabit::Allreduce( + reinterpret_cast(global_sketches.data()), + global_sketches.size() * sizeof(WQSketch::Entry) / sizeof(float)); } -void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { +void HostSketchContainer::AllReduce( + std::vector *p_reduced, + std::vector* p_num_cuts) { monitor_.Start(__func__); - rabit::Allreduce(columns_size_.data(), columns_size_.size()); - std::vector reduced(sketches_.size()); - std::vector num_cuts; - size_t nbytes = 0; + auto& num_cuts = *p_num_cuts; + CHECK_EQ(num_cuts.size(), 0); + auto &reduced = *p_reduced; + reduced.resize(sketches_.size()); + + size_t n_columns = sketches_.size(); + rabit::Allreduce(&n_columns, 1); + CHECK_EQ(n_columns, sketches_.size()) << "Number of columns differs across workers"; + + // Prune the intermediate num cuts for synchronization. + std::vector global_column_size(columns_size_); + rabit::Allreduce(global_column_size.data(), global_column_size.size()); + +size_t nbytes = 0; for (size_t i = 0; i < sketches_.size(); ++i) { int32_t intermediate_num_cuts = static_cast(std::min( - columns_size_[i], static_cast(max_bins_ * WQSketch::kFactor))); - if (columns_size_[i] != 0) { + global_column_size[i], static_cast(max_bins_ * WQSketch::kFactor))); + if (global_column_size[i] != 0) { WQSketch::SummaryContainer out; sketches_[i].GetSummary(&out); reduced[i].Reserve(intermediate_num_cuts); CHECK(reduced[i].data); reduced[i].SetPrune(out, intermediate_num_cuts); + nbytes = std::max( + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), + nbytes); } + num_cuts.push_back(intermediate_num_cuts); - nbytes = std::max( - WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts), nbytes); } + auto world = rabit::GetWorldSize(); + if (world == 1) { + return; + } + + std::vector worker_segments(1, 0); // CSC pointer to sketches. 
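+  // Flow from here on (as read from the code below): gather every worker's
+  // pruned summaries into one flat buffer, then rebuild and re-prune each
+  // feature's sketch from the per-worker slices in parallel.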
+ std::vector sketches_scan((n_columns + 1) * world, 0); + + std::vector global_sketches; + this->GatherSketchInfo(reduced, &worker_segments, &sketches_scan, + &global_sketches); + + std::vector final_sketches(n_columns); + ParallelFor(n_columns, omp_get_max_threads(), [&](size_t fidx) { + int32_t intermediate_num_cuts = num_cuts[fidx]; + auto nbytes = + WQSketch::SummaryContainer::CalcMemCost(intermediate_num_cuts); + + for (int32_t i = 1; i < world + 1; ++i) { + auto size = worker_segments.at(i) - worker_segments[i - 1]; + auto worker_sketches = Span{global_sketches}.subspan( + worker_segments[i - 1], size); + auto worker_scan = + Span(sketches_scan) + .subspan((i - 1) * (n_columns + 1), (n_columns + 1)); + + auto worker_feature = worker_sketches.subspan( + worker_scan[fidx], worker_scan[fidx + 1] - worker_scan[fidx]); + CHECK(worker_feature.data()); + WQSummary summary(worker_feature.data(), + worker_feature.size()); + auto &out = final_sketches.at(fidx); + out.Reduce(summary, nbytes); + } + + reduced.at(fidx).Reserve(intermediate_num_cuts); + reduced.at(fidx).SetPrune(final_sketches.at(fidx), intermediate_num_cuts); + }); + monitor_.Stop(__func__); +} - if (rabit::IsDistributed()) { - // FIXME(trivialfis): This call will allocate nbytes * num_columns on rabit, which - // may generate oom error when data is sparse. To fix it, we need to: - // - gather the column offsets over all workers. - // - run rabit::allgather on sketch data to collect all data. - // - merge all gathered sketches based on worker offsets and column offsets of data - // from each worker. - // See GPU implementation for details. - rabit::SerializeReducer sreducer; - sreducer.Allreduce(dmlc::BeginPtr(reduced), nbytes, reduced.size()); +void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, + int max_bin, HistogramCuts *cuts) { + size_t required_cuts = std::min(summary.size, static_cast(max_bin)); + auto& cut_values = cuts->cut_values_.HostVector(); + for (size_t i = 1; i < required_cuts; ++i) { + bst_float cpt = summary.data[i].value; + if (i == 1 || cpt > cuts->cut_values_.ConstHostVector().back()) { + cut_values.push_back(cpt); + } } +} + +void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { + monitor_.Start(__func__); + std::vector reduced; + std::vector num_cuts; + this->AllReduce(&reduced, &num_cuts); cuts->min_vals_.HostVector().resize(sketches_.size(), 0.0f); + for (size_t fid = 0; fid < reduced.size(); ++fid) { WQSketch::SummaryContainer a; size_t max_num_bins = std::min(num_cuts[fid], max_bins_); a.Reserve(max_num_bins + 1); CHECK(a.data); - if (columns_size_[fid] != 0) { + if (num_cuts[fid] != 0) { a.SetPrune(reduced[fid], max_num_bins + 1); CHECK(a.data && reduced[fid].data); const bst_float mval = a.data[0].value; @@ -173,6 +305,7 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { const float mval = 1e-5f; cuts->min_vals_.HostVector()[fid] = mval; } + AddCutPoint(a, max_num_bins, cuts); // push a value that is greater than anything const bst_float cpt diff --git a/src/common/quantile.h b/src/common/quantile.h index 11e2530f748e..a70bf809ea28 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -166,6 +166,16 @@ struct WQSummary { * \param src source sketch */ inline void CopyFrom(const WQSummary &src) { + if (!src.data) { + CHECK_EQ(src.size, 0); + size = 0; + return; + } + if (!data) { + CHECK_EQ(this->size, 0); + CHECK_EQ(src.size, 0); + return; + } size = src.size; std::memcpy(data, src.data, sizeof(Entry) * size); } @@ -721,6 +731,14 @@ class HostSketchContainer { 
return use_group_ind; } + static std::vector CalcColumnSize(SparsePage const &page, + bst_feature_t const n_columns, + size_t const nthreads); + + static std::vector LoadBalance(SparsePage const &page, + bst_feature_t n_columns, + size_t const nthreads); + static uint32_t SearchGroupIndFromRow(std::vector const &group_ptr, size_t const base_rowid) { CHECK_LT(base_rowid, group_ptr.back()) @@ -730,6 +748,14 @@ class HostSketchContainer { group_ptr.cbegin() - 1; return group_ind; } + // Gather sketches from all workers. + void GatherSketchInfo(std::vector const &reduced, + std::vector *p_worker_segments, + std::vector *p_sketches_scan, + std::vector *p_global_sketches); + // Merge sketches from all workers. + void AllReduce(std::vector *p_reduced, + std::vector* p_num_cuts); /* \brief Push a CSR matrix. */ void PushRowPage(SparsePage const& page, MetaInfo const& info); diff --git a/src/common/random.cc b/src/common/random.cc new file mode 100644 index 000000000000..f386cad916b2 --- /dev/null +++ b/src/common/random.cc @@ -0,0 +1,38 @@ +/*! + * Copyright 2020 by XGBoost Contributors + * \file random.cc + */ +#include "random.h" + +namespace xgboost { +namespace common { +std::shared_ptr> ColumnSampler::ColSample( + std::shared_ptr> p_features, + float colsample) { + if (colsample == 1.0f) { + return p_features; + } + const auto &features = p_features->HostVector(); + CHECK_GT(features.size(), 0); + + int n = std::max(1, static_cast(colsample * features.size())); + auto p_new_features = std::make_shared>(); + auto &new_features = *p_new_features; + + if (feature_weights_.size() != 0) { + new_features.HostVector() = WeightedSamplingWithoutReplacement( + p_features->HostVector(), feature_weights_, n); + } else { + new_features.Resize(features.size()); + std::copy(features.begin(), features.end(), + new_features.HostVector().begin()); + std::shuffle(new_features.HostVector().begin(), + new_features.HostVector().end(), rng_); + new_features.Resize(n); + } + std::sort(new_features.HostVector().begin(), new_features.HostVector().end()); + return p_new_features; +} + +} // namespace common +} // namespace xgboost diff --git a/src/common/random.h b/src/common/random.h index 45af80ce030b..7fd461d22d0f 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -1,5 +1,5 @@ /*! - * Copyright 2015 by Contributors + * Copyright 2015-2020 by Contributors * \file random.h * \brief Utility related to random. * \author Tianqi Chen @@ -10,14 +10,17 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "xgboost/host_device_vector.h" +#include "common.h" namespace xgboost { namespace common { @@ -75,6 +78,38 @@ using GlobalRandomEngine = RandomEngine; */ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) +/* + * Original paper: + * Weighted Random Sampling (2005; Efraimidis, Spirakis) + * + * Blog: + * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/ +*/ +template +std::vector WeightedSamplingWithoutReplacement( + std::vector const &array, std::vector const &weights, size_t n) { + // ES sampling. 
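+  // (illustrative summary) each index i draws u ~ U(0, 1) and is keyed by
+  // log(u) / w_i; keeping the n largest keys reproduces the
+  // Efraimidis-Spirakis draw.  log(u) / w orders items exactly like the
+  // paper's u^(1/w) key (log is monotone) while being safer for tiny weights.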
+ CHECK_EQ(array.size(), weights.size()); + std::vector keys(weights.size()); + std::uniform_real_distribution dist; + auto& rng = GlobalRandom(); + for (size_t i = 0; i < array.size(); ++i) { + auto w = std::max(weights.at(i), kRtEps); + auto u = dist(rng); + auto k = std::log(u) / w; + keys[i] = k; + } + auto ind = ArgSort(keys, std::greater<>{}); + ind.resize(n); + + std::vector results(ind.size()); + for (size_t k = 0; k < ind.size(); ++k) { + auto idx = ind[k]; + results[k] = array[idx]; + } + return results; +} + /** * \class ColumnSampler * @@ -82,36 +117,18 @@ GlobalRandomEngine& GlobalRandom(); // NOLINT(*) * colsample_bynode parameters. Should be initialised before tree construction and to * reset when tree construction is completed. */ - class ColumnSampler { std::shared_ptr> feature_set_tree_; std::map>> feature_set_level_; + std::vector feature_weights_; float colsample_bylevel_{1.0f}; float colsample_bytree_{1.0f}; float colsample_bynode_{1.0f}; GlobalRandomEngine rng_; - std::shared_ptr> ColSample( - std::shared_ptr> p_features, float colsample) { - if (colsample == 1.0f) return p_features; - const auto& features = p_features->HostVector(); - CHECK_GT(features.size(), 0); - int n = std::max(1, static_cast(colsample * features.size())); - auto p_new_features = std::make_shared>(); - auto& new_features = *p_new_features; - new_features.Resize(features.size()); - std::copy(features.begin(), features.end(), - new_features.HostVector().begin()); - std::shuffle(new_features.HostVector().begin(), - new_features.HostVector().end(), rng_); - new_features.Resize(n); - std::sort(new_features.HostVector().begin(), - new_features.HostVector().end()); - - return p_new_features; - } - public: + std::shared_ptr> ColSample( + std::shared_ptr> p_features, float colsample); /** * \brief Column sampler constructor. * \note This constructor manually sets the rng seed @@ -139,8 +156,10 @@ class ColumnSampler { * \param colsample_bytree * \param skip_index_0 (Optional) True to skip index 0. 
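   * A minimal call sketch (illustrative, names hypothetical):
   *   cs.Init(n, {}, 1.0f, 1.0f, 0.5f);  // empty weights: uniform shuffle
   *   cs.Init(n, w,  1.0f, 1.0f, 0.5f);  // non-empty w: weighted sampling
   * An empty weight vector falls back to the std::shuffle path in ColSample().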
   */
-  void Init(int64_t num_col, float colsample_bynode, float colsample_bylevel,
+  void Init(int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel,
             float colsample_bytree, bool skip_index_0 = false) {
+    feature_weights_ = std::move(feature_weights);
     colsample_bylevel_ = colsample_bylevel;
     colsample_bytree_ = colsample_bytree;
     colsample_bynode_ = colsample_bynode;
diff --git a/src/common/version.cc b/src/common/version.cc
index 3fb2e5c24392..e9d4fe9d13d8 100644
--- a/src/common/version.cc
+++ b/src/common/version.cc
@@ -49,9 +49,9 @@ Version::TripletT Version::Load(dmlc::Stream* fi) {
     LOG(FATAL) << msg;
   }
 
-  CHECK_EQ(fi->Read(&major, sizeof(major)), sizeof(major)) << msg;
-  CHECK_EQ(fi->Read(&minor, sizeof(major)), sizeof(minor)) << msg;
-  CHECK_EQ(fi->Read(&patch, sizeof(major)), sizeof(patch)) << msg;
+  CHECK(fi->Read(&major)) << msg;
+  CHECK(fi->Read(&minor)) << msg;
+  CHECK(fi->Read(&patch)) << msg;
 
   return std::make_tuple(major, minor, patch);
 }
@@ -69,9 +69,9 @@ void Version::Save(dmlc::Stream* fo) {
   std::tie(major, minor, patch) = Self();
   std::string verstr { u8"version:" };
   fo->Write(&verstr[0], verstr.size());
-  fo->Write(&major, sizeof(major));
-  fo->Write(&minor, sizeof(minor));
-  fo->Write(&patch, sizeof(patch));
+  fo->Write(major);
+  fo->Write(minor);
+  fo->Write(patch);
 }
 
 std::string Version::String(TripletT const& version) {
diff --git a/src/data/data.cc b/src/data/data.cc
index 401a35081830..d7d18f189642 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -83,7 +83,7 @@ void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name,
   CHECK(strm->Read(&is_scalar)) << invalid;
   CHECK(is_scalar) << invalid << "Expected field " << expected_name
                    << " to be a scalar; got a vector";
-  CHECK(strm->Read(field, sizeof(T))) << invalid;
+  CHECK(strm->Read(field)) << invalid;
 }
 
 template <typename T>
@@ -293,6 +293,9 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
   } else {
     out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs);
   }
+
+  out.feature_weigths.Resize(this->feature_weigths.Size());
+  out.feature_weigths.Copy(this->feature_weigths);
   return out;
 }
 
@@ -377,6 +380,16 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
     labels.resize(num);
     DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
                        std::copy(cast_dptr, cast_dptr + num, labels.begin()));
+  } else if (!std::strcmp(key, "feature_weights")) {
+    auto &h_feature_weights = feature_weigths.HostVector();
+    h_feature_weights.resize(num);
+    DISPATCH_CONST_PTR(
+        dtype, dptr, cast_dptr,
+        std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin()));
+    bool valid =
+        std::all_of(h_feature_weights.cbegin(), h_feature_weights.cend(),
+                    [](float w) { return w >= 0; });
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
   } else {
     LOG(FATAL) << "Unknown key for MetaInfo: " << key;
   }
@@ -396,6 +409,8 @@ void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
     vec = &this->labels_lower_bound_.HostVector();
   } else if (!std::strcmp(key, "label_upper_bound")) {
     vec = &this->labels_upper_bound_.HostVector();
+  } else if (!std::strcmp(key, "feature_weights")) {
+    vec = &this->feature_weigths.HostVector();
   } else {
     LOG(FATAL) << "Unknown float field name: " << key;
   }
@@ -497,6 +512,11 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
     auto &h_feature_types = feature_types.HostVector();
     LoadFeatureType(this->feature_type_names, &h_feature_types);
   }
+  if (!that.feature_weigths.Empty()) {
+
this->feature_weigths.Resize(that.feature_weigths.Size()); + this->feature_weigths.SetDevice(that.feature_weigths.DeviceIdx()); + this->feature_weigths.Copy(that.feature_weigths); + } } void MetaInfo::Validate(int32_t device) const { @@ -538,6 +558,11 @@ void MetaInfo::Validate(int32_t device) const { check_device(labels_lower_bound_); return; } + if (feature_weigths.Size() != 0) { + CHECK_EQ(feature_weigths.Size(), num_col_) + << "Size of feature_weights must equal to number of columns."; + check_device(feature_weigths); + } if (labels_upper_bound_.Size() != 0) { CHECK_EQ(labels_upper_bound_.Size(), num_row_) << "Size of label_upper_bound must equal to number of rows."; @@ -628,14 +653,18 @@ DMatrix* DMatrix::Load(const std::string& uri, std::unique_ptr fi(dmlc::Stream::Create(fname.c_str(), "r", true)); if (fi != nullptr) { common::PeekableInStream is(fi.get()); - if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic) && - magic == data::SimpleDMatrix::kMagic) { - DMatrix* dmat = new data::SimpleDMatrix(&is); - if (!silent) { - LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " - << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + if (is.PeekRead(&magic, sizeof(magic)) == sizeof(magic)) { + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&magic, sizeof(magic), 1); + } + if (magic == data::SimpleDMatrix::kMagic) { + DMatrix* dmat = new data::SimpleDMatrix(&is); + if (!silent) { + LOG(CONSOLE) << dmat->Info().num_row_ << 'x' << dmat->Info().num_col_ << " matrix with " + << dmat->Info().num_nonzero_ << " entries loaded from " << uri; + } + return dmat; } - return dmat; } } } @@ -811,10 +840,11 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread // Set number of threads but keep old value so we can reset it after const int nthreadmax = omp_get_max_threads(); if (nthread <= 0) nthread = nthreadmax; - int nthread_original = omp_get_max_threads(); + const int nthread_original = omp_get_max_threads(); omp_set_num_threads(nthread); auto& offset_vec = offset.HostVector(); auto& data_vec = data.HostVector(); + size_t builder_base_row_offset = this->Size(); common::ParallelGroupBuilder< Entry, std::remove_reference::type::value_type> @@ -829,48 +859,74 @@ uint64_t SparsePage::Push(const AdapterBatchT& batch, float missing, int nthread last_line.GetElement(last_line.Size() - 1).row_idx - base_rowid; } } - builder.InitBudget(expected_rows, nthread); + size_t batch_size = batch.Size(); + const size_t thread_size = batch_size/nthread; + builder.InitBudget(expected_rows+1, nthread); uint64_t max_columns = 0; - + if (batch_size == 0) { + omp_set_num_threads(nthread_original); + return max_columns; + } + std::vector> max_columns_vector(nthread); + dmlc::OMPException exec; // First-pass over the batch counting valid elements - size_t batch_size = batch.Size(); -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - data::COOTuple element = line.GetElement(j); - max_columns = - std::max(max_columns, static_cast(element.column_idx + 1)); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - base_rowid; - // Adapter row index is absolute, here we want it relative to - // current page - CHECK_GE(key, builder_base_row_offset); - builder.AddBudget(key, tid); +#pragma omp parallel num_threads(nthread) + { + 
exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; + max_columns_vector[tid].resize(1, 0); + uint64_t& max_columns_local = max_columns_vector[tid][0]; + + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = element.row_idx - base_rowid; + CHECK_GE(key, builder_base_row_offset); + max_columns_local = + std::max(max_columns_local, static_cast(element.column_idx + 1)); + + if (!common::CheckNAN(element.value) && element.value != missing) { + // Adapter row index is absolute, here we want it relative to + // current page + builder.AddBudget(key, tid); + } + } } - } + }); } + exec.Rethrow(); + for (const auto & max : max_columns_vector) { + max_columns = std::max(max_columns, max[0]); + } + builder.InitStorage(); // Second pass over batch, placing elements in correct position -#pragma omp parallel for schedule(static) - for (omp_ulong i = 0; i < static_cast(batch_size); - ++i) { // NOLINT(*) - int tid = omp_get_thread_num(); - auto line = batch.GetLine(i); - for (auto j = 0ull; j < line.Size(); j++) { - auto element = line.GetElement(j); - if (!common::CheckNAN(element.value) && element.value != missing) { - size_t key = element.row_idx - - base_rowid; // Adapter row index is absolute, here we want - // it relative to current page - builder.Push(key, Entry(element.column_idx, element.value), tid); + +#pragma omp parallel num_threads(nthread) + { + exec.Run([&]() { + int tid = omp_get_thread_num(); + size_t begin = tid*thread_size; + size_t end = tid != (nthread-1) ? (tid+1)*thread_size : batch_size; + for (size_t i = begin; i < end; ++i) { + auto line = batch.GetLine(i); + for (auto j = 0ull; j < line.Size(); j++) { + auto element = line.GetElement(j); + const size_t key = (element.row_idx - base_rowid); + if (!common::CheckNAN(element.value) && element.value != missing) { + builder.Push(key, Entry(element.column_idx, element.value), tid); + } + } } - } + }); } + exec.Rethrow(); omp_set_num_threads(nthread_original); + return max_columns; } diff --git a/src/data/data.cu b/src/data/data.cu index 5e63a828c207..15260498734d 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -58,6 +58,15 @@ void CopyGroupInfoImpl(ArrayInterface column, std::vector* out) { std::partial_sum(out->begin(), out->end(), out->begin()); } +namespace { +// thrust::all_of tries to copy lambda function. 
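+// A plain functor with a __device__ operator() sidesteps that copy problem;
+// (assumed rationale) a __device__ lambda would also require nvcc's
+// --expt-extended-lambda flag before thrust::all_of could accept it.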
+struct AllOfOp {
+  __device__ bool operator()(float w) {
+    return w >= 0;
+  }
+};
+}  // anonymous namespace
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get(j_interface);
@@ -82,6 +91,21 @@
   } else if (key == "group") {
     CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
+  } else if (key == "label_lower_bound") {
+    CopyInfoImpl(array_interface, &labels_lower_bound_);
+    return;
+  } else if (key == "label_upper_bound") {
+    CopyInfoImpl(array_interface, &labels_upper_bound_);
+    return;
+  } else if (key == "feature_weights") {
+    CopyInfoImpl(array_interface, &feature_weigths);
+    auto d_feature_weights = feature_weigths.ConstDeviceSpan();
+    auto valid =
+        thrust::all_of(thrust::device, d_feature_weights.data(),
+                       d_feature_weights.data() + d_feature_weights.size(),
+                       AllOfOp{});
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
+    return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
   }
diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc
index f054ff64a490..06fa385b48de 100644
--- a/src/data/simple_dmatrix.cc
+++ b/src/data/simple_dmatrix.cc
@@ -192,8 +192,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) {
 SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
   int tmagic;
-  CHECK(in_stream->Read(&tmagic, sizeof(tmagic)) == sizeof(tmagic))
-      << "invalid input file format";
+  CHECK(in_stream->Read(&tmagic)) << "invalid input file format";
   CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
   info_.LoadBinary(in_stream);
   in_stream->Read(&sparse_page_.offset.HostVector());
@@ -203,7 +202,7 @@ void SimpleDMatrix::SaveToLocalFile(const std::string& fname) {
   std::unique_ptr<dmlc::Stream> fo(dmlc::Stream::Create(fname.c_str(), "w"));
   int tmagic = kMagic;
-  fo->Write(&tmagic, sizeof(tmagic));
+  fo->Write(tmagic);
   info_.SaveBinary(fo.get());
   fo->Write(sparse_page_.offset.HostVector());
   fo->Write(sparse_page_.data.HostVector());
diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h
index 108af403b1a3..6db6de9fad55 100644
--- a/src/data/sparse_page_source.h
+++ b/src/data/sparse_page_source.h
@@ -144,7 +144,7 @@ class ExternalMemoryPrefetcher : dmlc::DataIter {
       std::unique_ptr<dmlc::Stream> finfo(
           dmlc::Stream::Create(info.name_info.c_str(), "r"));
       int tmagic;
-      CHECK_EQ(finfo->Read(&tmagic, sizeof(tmagic)), sizeof(tmagic));
+      CHECK(finfo->Read(&tmagic));
       CHECK_EQ(tmagic, kMagic) << "invalid format, magic number mismatch";
     }
     files_.resize(info.name_shards.size());
@@ -359,7 +359,7 @@ class SparsePageSource {
     std::unique_ptr<dmlc::Stream> fo(
         dmlc::Stream::Create(cache_info_.name_info.c_str(), "w"));
     int tmagic = kMagic;
-    fo->Write(&tmagic, sizeof(tmagic));
+    fo->Write(tmagic);
     // Either every row has query ID or none at all
     CHECK(qids.empty() || qids.size() == info.num_row_);
     info.SaveBinary(fo.get());
diff --git a/src/gbm/gbtree_model.cc b/src/gbm/gbtree_model.cc
index 8ebd8284c269..4a20b48f7d1d 100644
--- a/src/gbm/gbtree_model.cc
+++ b/src/gbm/gbtree_model.cc
@@ -12,18 +12,35 @@ namespace gbm {
 void GBTreeModel::Save(dmlc::Stream* fo) const {
   CHECK_EQ(param.num_trees, static_cast<int>(trees.size()));
-  fo->Write(&param, sizeof(param));
+
+  if (DMLC_IO_NO_ENDIAN_SWAP) {
+    fo->Write(&param, sizeof(param));
+  } else {
+    auto x = param.ByteSwap();
+    fo->Write(&x, sizeof(x));
+  }
   for (const
auto & tree : trees) { tree->Save(fo); } if (tree_info.size() != 0) { - fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(tree_info), sizeof(int32_t) * tree_info.size()); + } else { + for (const auto& e : tree_info) { + auto x = e; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(x)); + } + } } } void GBTreeModel::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(param)), sizeof(param)) << "GBTree: invalid model file"; + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } trees.clear(); trees_to_update.clear(); for (int32_t i = 0; i < param.num_trees; ++i) { @@ -33,9 +50,16 @@ void GBTreeModel::Load(dmlc::Stream* fi) { } tree_info.resize(param.num_trees); if (param.num_trees != 0) { - CHECK_EQ( - fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), - sizeof(int32_t) * param.num_trees); + if (DMLC_IO_NO_ENDIAN_SWAP) { + CHECK_EQ( + fi->Read(dmlc::BeginPtr(tree_info), sizeof(int32_t) * param.num_trees), + sizeof(int32_t) * param.num_trees); + } else { + for (auto& info : tree_info) { + CHECK_EQ(fi->Read(&info, sizeof(int32_t)), sizeof(int32_t)); + dmlc::ByteSwap(&info, sizeof(info), 1); + } + } } } diff --git a/src/gbm/gbtree_model.h b/src/gbm/gbtree_model.h index 7ac7d8f470a2..5a89878d3816 100644 --- a/src/gbm/gbtree_model.h +++ b/src/gbm/gbtree_model.h @@ -61,6 +61,21 @@ struct GBTreeModelParam : public dmlc::Parameter { .set_default(0) .describe("Reserved option for vector tree."); } + + // Swap byte order for all fields. Useful for transporting models between machines with different + // endianness (big endian vs little endian) + inline GBTreeModelParam ByteSwap() const { + GBTreeModelParam x = *this; + dmlc::ByteSwap(&x.num_trees, sizeof(x.num_trees), 1); + dmlc::ByteSwap(&x.deprecated_num_roots, sizeof(x.deprecated_num_roots), 1); + dmlc::ByteSwap(&x.deprecated_num_feature, sizeof(x.deprecated_num_feature), 1); + dmlc::ByteSwap(&x.pad_32bit, sizeof(x.pad_32bit), 1); + dmlc::ByteSwap(&x.deprecated_num_pbuffer, sizeof(x.deprecated_num_pbuffer), 1); + dmlc::ByteSwap(&x.deprecated_num_output_group, sizeof(x.deprecated_num_output_group), 1); + dmlc::ByteSwap(&x.size_leaf_vector, sizeof(x.size_leaf_vector), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } }; struct GBTreeModel : public Model { diff --git a/src/learner.cc b/src/learner.cc index 47080a5c12b9..8210c4d1c89b 100644 --- a/src/learner.cc +++ b/src/learner.cc @@ -128,6 +128,19 @@ struct LearnerModelParamLegacy : public dmlc::Parameter std::string str = get(j_param.at("base_score")); from_chars(str.c_str(), str.c_str() + str.size(), base_score); } + inline LearnerModelParamLegacy ByteSwap() const { + LearnerModelParamLegacy x = *this; + dmlc::ByteSwap(&x.base_score, sizeof(x.base_score), 1); + dmlc::ByteSwap(&x.num_feature, sizeof(x.num_feature), 1); + dmlc::ByteSwap(&x.num_class, sizeof(x.num_class), 1); + dmlc::ByteSwap(&x.contain_extra_attrs, sizeof(x.contain_extra_attrs), 1); + dmlc::ByteSwap(&x.contain_eval_metrics, sizeof(x.contain_eval_metrics), 1); + dmlc::ByteSwap(&x.major_version, sizeof(x.major_version), 1); + dmlc::ByteSwap(&x.minor_version, sizeof(x.minor_version), 1); + dmlc::ByteSwap(x.reserved, sizeof(x.reserved[0]), sizeof(x.reserved) / sizeof(x.reserved[0])); + return x; + } + // declare parameters DMLC_DECLARE_PARAMETER(LearnerModelParamLegacy) { DMLC_DECLARE_FIELD(base_score) @@ -694,7 +707,9 @@ class LearnerIO : public 
LearnerConfiguration { // read parameter CHECK_EQ(fi->Read(&mparam_, sizeof(mparam_)), sizeof(mparam_)) << "BoostLearner: wrong model format"; - + if (!DMLC_IO_NO_ENDIAN_SWAP) { + mparam_ = mparam_.ByteSwap(); + } CHECK(fi->Read(&tparam_.objective)) << "BoostLearner: wrong model format"; CHECK(fi->Read(&tparam_.booster)) << "BoostLearner: wrong model format"; @@ -828,7 +843,12 @@ class LearnerIO : public LearnerConfiguration { } std::string header {"binf"}; fo->Write(header.data(), 4); - fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&mparam, sizeof(LearnerModelParamLegacy)); + } else { + LearnerModelParamLegacy x = mparam.ByteSwap(); + fo->Write(&x, sizeof(LearnerModelParamLegacy)); + } fo->Write(tparam_.objective); fo->Write(tparam_.booster); gbm_->Save(fo); @@ -867,7 +887,13 @@ class LearnerIO : public LearnerConfiguration { // concatonate the model and config at final output, it's a temporary solution for // continuing support for binary model format fo->Write(&serialisation_header_[0], serialisation_header_.size()); - fo->Write(&json_offset, sizeof(json_offset)); + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(&json_offset, sizeof(json_offset)); + } else { + auto x = json_offset; + dmlc::ByteSwap(&x, sizeof(x), 1); + fo->Write(&x, sizeof(json_offset)); + } fo->Write(&binary_buf[0], binary_buf.size()); fo->Write(&config_str[0], config_str.size()); } @@ -904,6 +930,9 @@ class LearnerIO : public LearnerConfiguration { )doc"; int64_t sz {-1}; CHECK_EQ(fp.Read(&sz, sizeof(sz)), sizeof(sz)); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + dmlc::ByteSwap(&sz, sizeof(sz), 1); + } CHECK_GT(sz, 0); size_t json_offset = static_cast(sz); std::string buffer; diff --git a/src/tree/param.h b/src/tree/param.h index 280f06066e44..dedc2a7f0ff5 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -239,6 +239,10 @@ struct TrainParam : public XGBoostParameter { if (this->max_leaves > 0) { n_nodes = this->max_leaves * 2 - 1; } else { + // bst_node_t will overflow. 
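+      // (illustrative) a full binary tree of depth d has 2^(d+1) - 1 nodes,
+      // so depth 31 already implies 2^32 - 1 node ids, beyond what the
+      // 32-bit signed bst_node_t can index.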
+ CHECK_LE(this->max_depth, 31) + << "max_depth can not be greater than 31 as that might generate 2 ** " + "32 - 1 nodes."; n_nodes = (1 << (this->max_depth + 1)) - 1; } CHECK_NE(n_nodes, 0); diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 8f45621ca15e..7f9721aef1d9 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -664,13 +664,26 @@ bst_node_t RegTree::GetNumSplitNodes() const { void RegTree::Load(dmlc::Stream* fi) { CHECK_EQ(fi->Read(¶m, sizeof(TreeParam)), sizeof(TreeParam)); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + param = param.ByteSwap(); + } nodes_.resize(param.num_nodes); stats_.resize(param.num_nodes); CHECK_NE(param.num_nodes, 0); CHECK_EQ(fi->Read(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()), sizeof(Node) * nodes_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (Node& node : nodes_) { + node = node.ByteSwap(); + } + } CHECK_EQ(fi->Read(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * stats_.size()), sizeof(RTreeNodeStat) * stats_.size()); + if (!DMLC_IO_NO_ENDIAN_SWAP) { + for (RTreeNodeStat& stat : stats_) { + stat = stat.ByteSwap(); + } + } // chg deleted nodes deleted_nodes_.resize(0); for (int i = 1; i < param.num_nodes; ++i) { @@ -683,11 +696,32 @@ void RegTree::Load(dmlc::Stream* fi) { void RegTree::Save(dmlc::Stream* fo) const { CHECK_EQ(param.num_nodes, static_cast(nodes_.size())); CHECK_EQ(param.num_nodes, static_cast(stats_.size())); - fo->Write(¶m, sizeof(TreeParam)); CHECK_EQ(param.deprecated_num_roots, 1); CHECK_NE(param.num_nodes, 0); - fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); - fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(¶m, sizeof(TreeParam)); + } else { + TreeParam x = param.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(nodes_), sizeof(Node) * nodes_.size()); + } else { + for (const Node& node : nodes_) { + Node x = node.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } + if (DMLC_IO_NO_ENDIAN_SWAP) { + fo->Write(dmlc::BeginPtr(stats_), sizeof(RTreeNodeStat) * nodes_.size()); + } else { + for (const RTreeNodeStat& stat : stats_) { + RTreeNodeStat x = stat.ByteSwap(); + fo->Write(&x, sizeof(x)); + } + } } void RegTree::LoadModel(Json const& in) { diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 951cfdb5ec27..45cdb0ba9163 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -235,8 +235,10 @@ class ColMaker: public TreeUpdater { } } { - column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bynode, - param_.colsample_bylevel, param_.colsample_bytree); + column_sampler_.Init(fmat.Info().num_col_, + fmat.Info().feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree); } { // setup temp space for each thread diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 5cbe75350402..3535a59d6f85 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -266,8 +266,10 @@ struct GPUHistMakerDevice { // Note that the column sampler must be passed by value because it is not // thread safe void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { - this->column_sampler.Init(num_columns, param.colsample_bynode, - param.colsample_bylevel, param.colsample_bytree); + auto const& info = dmat->Info(); + this->column_sampler.Init(num_columns, info.feature_weigths.HostVector(), + param.colsample_bynode, 
param.colsample_bylevel, + param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id)); this->interaction_constraints.Reset(); std::fill(node_sum_gradients.begin(), node_sum_gradients.end(), diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 37a90dfebd74..95d3c2008ef9 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -841,11 +841,13 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix& // store a pointer to the tree p_last_tree_ = &tree; if (data_layout_ == kDenseDataOneBased) { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, true); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, true); } else { - column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel, - param_.colsample_bytree, false); + column_sampler_.Init(info.num_col_, info.feature_weigths.ConstHostVector(), + param_.colsample_bynode, param_.colsample_bylevel, + param_.colsample_bytree, false); } if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) { /* specialized code for dense data: diff --git a/tests/ci_build/Dockerfile.s390x b/tests/ci_build/Dockerfile.s390x new file mode 100644 index 000000000000..5ad4a7888feb --- /dev/null +++ b/tests/ci_build/Dockerfile.s390x @@ -0,0 +1,27 @@ +FROM s390x/ubuntu:20.04 + +# Environment +ENV DEBIAN_FRONTEND noninteractive +SHELL ["/bin/bash", "-c"] # Use Bash as shell + +# Install all basic requirements +RUN \ + apt-get update && \ + apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \ + cmake time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base && \ + python3 -m pip install pytest hypothesis + +ENV GOSU_VERSION 1.10 + +# Install lightweight sudo (not bound to TTY) +RUN set -ex; \ + wget -O /usr/local/bin/gosu "https://github.com/tianon/gosu/releases/download/$GOSU_VERSION/gosu-amd64" && \ + chmod +x /usr/local/bin/gosu && \ + gosu nobody true + +# Default entry-point to use if running locally +# It will preserve attributes of created files +COPY entrypoint.sh /scripts/ + +WORKDIR /workspace +ENTRYPOINT ["/scripts/entrypoint.sh"] diff --git a/tests/ci_build/deploy_jvm_packages.sh b/tests/ci_build/deploy_jvm_packages.sh index 950cfeb38573..50a190862dd1 100755 --- a/tests/ci_build/deploy_jvm_packages.sh +++ b/tests/ci_build/deploy_jvm_packages.sh @@ -3,22 +3,32 @@ set -e set -x -if [ $# -ne 1 ]; then - echo "Usage: $0 [spark version]" +if [ $# -ne 2 ]; then + echo "Usage: $0 [spark version] [build_gpu? 0 or 1]" exit 1 fi spark_version=$1 +build_gpu=$2 # Initialize local Maven repository ./tests/ci_build/initialize_maven.sh -rm -rf build/ cd jvm-packages +rm -rf $(find . -name target) +rm -rf ../build/ # Re-build package without Mock Rabit # Deploy to S3 bucket xgboost-maven-repo -mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +if [[ "$build_gpu" == "0" ]] +then + # Build CPU artifact + mvn --no-transfer-progress package deploy -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +else + # Build GPU artifact + sed -i -e 's/xgboost\(.*\)_\(.*\)<\/artifactId>/xgboost\1-gpu_\2<\/artifactId>/' $(find . 
-name pom.xml) + mvn --no-transfer-progress package deploy -Duse.cuda=ON -P release-to-s3 -Dspark.version=${spark_version} -DskipTests +fi set +x set +e diff --git a/tests/ci_build/doxygen.sh b/tests/ci_build/doxygen.sh deleted file mode 100755 index 41757eb6935f..000000000000 --- a/tests/ci_build/doxygen.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash - -if [ $# -ne 1 ]; then - echo "Usage: $0 [branch name]" - exit 1 -fi - -set -e -set -x - -branch_name=$1 - -rm -rf build -mkdir build -cd build -cmake .. -DBUILD_C_DOC=ON -make -j - -tar cvjf ${branch_name}.tar.bz2 doc_doxygen/ diff --git a/tests/cpp/c_api/test_c_api.cc b/tests/cpp/c_api/test_c_api.cc index f4c2722fe92c..664118780cc3 100644 --- a/tests/cpp/c_api/test_c_api.cc +++ b/tests/cpp/c_api/test_c_api.cc @@ -23,9 +23,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { std::shared_ptr *dmat = static_cast *>(handle); xgboost::MetaInfo &info = (*dmat)->Info(); - ASSERT_EQ(info.num_col_, 2); - ASSERT_EQ(info.num_row_, 3); - ASSERT_EQ(info.num_nonzero_, 6); + ASSERT_EQ(info.num_col_, 2ul); + ASSERT_EQ(info.num_row_, 3ul); + ASSERT_EQ(info.num_nonzero_, 6ul); for (const auto &batch : (*dmat)->GetBatches()) { ASSERT_EQ(batch[0][0].fvalue, 0.0f); @@ -38,9 +38,9 @@ TEST(CAPI, XGDMatrixCreateFromMatDT) { } TEST(CAPI, XGDMatrixCreateFromMatOmp) { - std::vector num_rows = {100, 11374, 15000}; + std::vector num_rows = {100, 11374, 15000}; for (auto row : num_rows) { - int num_cols = 50; + bst_ulong num_cols = 50; int num_missing = 5; DMatrixHandle handle; std::vector data(num_cols * row, 1.5); diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc new file mode 100644 index 000000000000..006860b11af2 --- /dev/null +++ b/tests/cpp/common/test_common.cc @@ -0,0 +1,13 @@ +#include +#include "../../../src/common/common.h" + +namespace xgboost { +namespace common { +TEST(ArgSort, Basic) { + std::vector inputs {3.0, 2.0, 1.0}; + auto ret = ArgSort(inputs); + std::vector sol{2, 1, 0}; + ASSERT_EQ(ret, sol); +} +} // namespace common +} // namespace xgboost diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 0fad360f4298..24c23b3e2608 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -159,10 +159,10 @@ TEST(CutsBuilder, SearchGroupInd) { HistogramCuts hmat; size_t group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 0); - ASSERT_EQ(group_ind, 0); + ASSERT_EQ(group_ind, 0ul); group_ind = HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 5); - ASSERT_EQ(group_ind, 2); + ASSERT_EQ(group_ind, 2ul); EXPECT_ANY_THROW(HostSketchContainer::SearchGroupIndFromRow(p_mat->Info().group_ptr_, 17)); @@ -189,7 +189,7 @@ TEST(HistUtil, DenseCutsCategorical) { EXPECT_LT(cuts.MinValues()[0], x_sorted.front()); EXPECT_GT(cuts_from_sketch.front(), x_sorted.front()); EXPECT_GE(cuts_from_sketch.back(), x_sorted.back()); - EXPECT_EQ(cuts_from_sketch.size(), num_categories); + EXPECT_EQ(cuts_from_sketch.size(), static_cast(num_categories)); } } } diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index bd88d14ef1f2..d025e5ea60bf 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -162,7 +162,7 @@ inline void ValidateColumn(const HistogramCuts& cuts, int column_idx, // Check all cut points are unique EXPECT_EQ(std::set(cuts_begin, cuts_end).size(), - cuts_end - cuts_begin); + static_cast(cuts_end - cuts_begin)); auto unique = std::set(sorted_column.begin(), 
sorted_column.end()); if (unique.size() <= num_bins) { @@ -189,7 +189,7 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, // Collect data into columns std::vector> columns(dmat->Info().num_col_); for (auto& batch : dmat->GetBatches()) { - ASSERT_GT(batch.Size(), 0); + ASSERT_GT(batch.Size(), 0ul); for (auto i = 0ull; i < batch.Size(); i++) { for (auto e : batch[i]) { columns[e.index].push_back(e.fvalue); diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index ba3b12e337e2..029beee8d48b 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -222,7 +222,7 @@ TEST(Json, ParseArray) { auto json = Json::Load(StringView{str.c_str(), str.size()}); json = json["nodes"]; std::vector arr = get(json); - ASSERT_EQ(arr.size(), 3); + ASSERT_EQ(arr.size(), 3ul); Json v0 = arr[0]; ASSERT_EQ(get(v0["depth"]), 3); ASSERT_NEAR(get(v0["gain"]), 10.4866, kRtEps); @@ -284,7 +284,7 @@ TEST(Json, EmptyArray) { std::istringstream iss(str); auto json = Json::Load(StringView{str.c_str(), str.size()}); auto arr = get(json["leaf_vector"]); - ASSERT_EQ(arr.size(), 0); + ASSERT_EQ(arr.size(), 0ul); } TEST(Json, Boolean) { @@ -315,7 +315,7 @@ TEST(Json, AssigningObjects) { Json json; json = JsonObject(); json["Okay"] = JsonArray(); - ASSERT_EQ(get(json["Okay"]).size(), 0); + ASSERT_EQ(get(json["Okay"]).size(), 0ul); } { @@ -453,7 +453,8 @@ TEST(Json, Invalid) { Json load{Json::Load(StringView(str.c_str(), str.size()))}; } catch (dmlc::Error const &e) { std::string msg = e.what(); - ASSERT_NE(msg.find("EOF"), std::string::npos); + ASSERT_TRUE(msg.find("EOF") != std::string::npos + || msg.find("255") != std::string::npos); // EOF is printed as 255 on s390x has_thrown = true; }; ASSERT_TRUE(has_thrown); diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index c273658e54cb..fa748de1cc6c 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -5,14 +5,122 @@ namespace xgboost { namespace common { + +TEST(Quantile, LoadBalance) { + size_t constexpr kRows = 1000, kCols = 100; + auto m = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(); + std::vector cols_ptr; + for (auto const &page : m->GetBatches()) { + cols_ptr = HostSketchContainer::LoadBalance(page, kCols, 13); + } + size_t n_cols = 0; + for (size_t i = 1; i < cols_ptr.size(); ++i) { + n_cols += cols_ptr[i] - cols_ptr[i - 1]; + } + CHECK_EQ(n_cols, kCols); +} + +void TestDistributedQuantile(size_t rows, size_t cols) { + std::string msg {"Skipping AllReduce test"}; + int32_t constexpr kWorkers = 4; + InitRabitContext(msg, kWorkers); + auto world = rabit::GetWorldSize(); + if (world != 1) { + ASSERT_EQ(world, kWorkers); + } else { + return; + } + + std::vector infos(2); + auto& h_weights = infos.front().weights_.HostVector(); + h_weights.resize(rows); + SimpleLCG lcg; + SimpleRealUniformDistribution dist(3, 1000); + std::generate(h_weights.begin(), h_weights.end(), [&]() { return dist(&lcg); }); + std::vector column_size(cols, rows); + size_t n_bins = 64; + + // Generate cuts for distributed environment. 
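+  // Shape of the test (as read from the code below): each rank sketches its
+  // own random shard, MakeCuts() runs the distributed merge, and the result
+  // is compared against a single node that pushed every shard itself.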
+ auto sparsity = 0.5f; + auto rank = rabit::GetRank(); + HostSketchContainer sketch_distributed(column_size, n_bins, false); + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_distributed.PushRowPage(page, m->Info()); + } + HistogramCuts distributed_cuts; + sketch_distributed.MakeCuts(&distributed_cuts); + + // Generate cuts for single node environment + rabit::Finalize(); + CHECK_EQ(rabit::GetWorldSize(), 1); + std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); + HostSketchContainer sketch_on_single_node(column_size, n_bins, false); + for (auto rank = 0; rank < world; ++rank) { + auto m = RandomDataGenerator{rows, cols, sparsity} + .Seed(rank) + .Lower(.0f) + .Upper(1.0f) + .GenerateDMatrix(); + for (auto const &page : m->GetBatches()) { + sketch_on_single_node.PushRowPage(page, m->Info()); + } + } + + HistogramCuts single_node_cuts; + sketch_on_single_node.MakeCuts(&single_node_cuts); + + auto const& sptrs = single_node_cuts.Ptrs(); + auto const& dptrs = distributed_cuts.Ptrs(); + auto const& svals = single_node_cuts.Values(); + auto const& dvals = distributed_cuts.Values(); + auto const& smins = single_node_cuts.MinValues(); + auto const& dmins = distributed_cuts.MinValues(); + + ASSERT_EQ(sptrs.size(), dptrs.size()); + for (size_t i = 0; i < sptrs.size(); ++i) { + ASSERT_EQ(sptrs[i], dptrs[i]); + } + + ASSERT_EQ(svals.size(), dvals.size()); + for (size_t i = 0; i < svals.size(); ++i) { + ASSERT_NEAR(svals[i], dvals[i], 2e-2f); + } + + ASSERT_EQ(smins.size(), dmins.size()); + for (size_t i = 0; i < smins.size(); ++i) { + ASSERT_FLOAT_EQ(smins[i], dmins[i]); + } +} + +TEST(Quantile, DistributedBasic) { +#if defined(__unix__) + constexpr size_t kRows = 10, kCols = 10; + TestDistributedQuantile(kRows, kCols); +#endif +} + +TEST(Quantile, Distributed) { +#if defined(__unix__) + constexpr size_t kRows = 1000, kCols = 200; + TestDistributedQuantile(kRows, kCols); +#endif +} + TEST(Quantile, SameOnAllWorkers) { +#if defined(__unix__) std::string msg{"Skipping Quantile AllreduceBasic test"}; - size_t constexpr kWorkers = 4; + int32_t constexpr kWorkers = 4; InitRabitContext(msg, kWorkers); auto world = rabit::GetWorldSize(); if (world != 1) { CHECK_EQ(world, kWorkers); } else { + LOG(WARNING) << msg; return; } @@ -72,6 +180,8 @@ TEST(Quantile, SameOnAllWorkers) { } } }); + rabit::Finalize(); +#endif // defined(__unix__) } } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 7dea0b17deb3..e91f19ef84a8 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -7,7 +7,7 @@ namespace xgboost { namespace common { -inline void InitRabitContext(std::string msg, size_t n_workers) { +inline void InitRabitContext(std::string msg, int32_t n_workers) { auto port = std::getenv("DMLC_TRACKER_PORT"); std::string port_str; if (port) { @@ -35,7 +35,7 @@ template void RunWithSeedsAndBins(size_t rows, Fn fn) { for (size_t i = 0; i < bins.size() - 1; ++i) { bins[i] = i * 35 + 2; } - bins.back() = rows + 80; // provide a bin number greater than rows. + bins.back() = rows + 160; // provide a bin number greater than rows. 
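+  // (note) requesting more candidate bins than rows lets every distinct
+  // value become its own cut, which exercises the pruning path in the
+  // sketch container.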
std::vector infos(2); auto& h_weights = infos.front().weights_.HostVector(); diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc index dc7b38554162..9b2a1515543f 100644 --- a/tests/cpp/common/test_random.cc +++ b/tests/cpp/common/test_random.cc @@ -8,9 +8,10 @@ namespace common { TEST(ColumnSampler, Test) { int n = 128; ColumnSampler cs; + std::vector feature_weights; // No node sampling - cs.Init(n, 1.0f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f); auto set0 = cs.GetFeatureSet(0); ASSERT_EQ(set0->Size(), 32); @@ -23,7 +24,7 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set2->Size(), 32); // Node sampling - cs.Init(n, 0.5f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f); auto set3 = cs.GetFeatureSet(0); ASSERT_EQ(set3->Size(), 32); @@ -33,19 +34,19 @@ TEST(ColumnSampler, Test) { ASSERT_EQ(set4->Size(), 32); // No level or node sampling, should be the same at different depth - cs.Init(n, 1.0f, 1.0f, 0.5f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f); ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(), cs.GetFeatureSet(1)->HostVector()); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set5 = cs.GetFeatureSet(0); ASSERT_EQ(set5->Size(), n); - cs.Init(n, 1.0f, 1.0f, 1.0f); + cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f); auto set6 = cs.GetFeatureSet(0); ASSERT_EQ(set5->HostVector(), set6->HostVector()); // Should always be a minimum of one feature - cs.Init(n, 1e-16f, 1e-16f, 1e-16f); + cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f); ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1); } @@ -56,13 +57,13 @@ TEST(ColumnSampler, ThreadSynchronisation) { size_t iterations = 10; size_t levels = 5; std::vector reference_result; - bool success = - true; // Cannot use google test asserts in multithreaded region + std::vector feature_weights; + bool success = true; // Cannot use google test asserts in multithreaded region #pragma omp parallel num_threads(num_threads) { for (auto j = 0ull; j < iterations; j++) { ColumnSampler cs(j); - cs.Init(n, 0.5f, 0.5f, 0.5f); + cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f); for (auto level = 0ull; level < levels; level++) { auto result = cs.GetFeatureSet(level)->ConstHostVector(); #pragma omp single @@ -76,5 +77,54 @@ TEST(ColumnSampler, ThreadSynchronisation) { } ASSERT_TRUE(success); } + +TEST(ColumnSampler, WeightedSampling) { + auto test_basic = [](int first) { + std::vector feature_weights(2); + feature_weights[0] = std::abs(first - 1.0f); + feature_weights[1] = first - 0.0f; + ColumnSampler cs{0}; + cs.Init(2, feature_weights, 1.0, 1.0, 0.5); + auto feature_sets = cs.GetFeatureSet(0); + auto const &h_feat_set = feature_sets->HostVector(); + ASSERT_EQ(h_feat_set.size(), 1); + ASSERT_EQ(h_feat_set[0], first - 0); + }; + + test_basic(0); + test_basic(1); + + size_t constexpr kCols = 64; + std::vector feature_weights(kCols); + SimpleLCG rng; + SimpleRealUniformDistribution dist(.0f, 12.0f); + std::generate(feature_weights.begin(), feature_weights.end(), [&]() { return dist(&rng); }); + ColumnSampler cs{0}; + cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f); + std::vector features(kCols); + std::iota(features.begin(), features.end(), 0); + std::vector freq(kCols, 0); + for (size_t i = 0; i < 1024; ++i) { + auto fset = cs.GetFeatureSet(0); + ASSERT_EQ(kCols * 0.5, fset->Size()); + auto const& h_fset = fset->HostVector(); + for (auto f : h_fset) { + freq[f] += 1.0f; + } + } + + auto norm = std::accumulate(freq.cbegin(), freq.cend(), .0f); + for (auto& f : freq) { + f /= norm; + } + 
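+  // feature_weights is normalized below in the same way, so each feature's
+  // empirical selection frequency can be compared against its normalized
+  // weight (both vectors sum to 1).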
norm = std::accumulate(feature_weights.cbegin(), feature_weights.cend(), .0f);
+  for (auto& f : feature_weights) {
+    f /= norm;
+  }
+
+  for (size_t i = 0; i < feature_weights.size(); ++i) {
+    EXPECT_NEAR(freq[i], feature_weights[i], 1e-2);
+  }
+}
 }  // namespace common
 }  // namespace xgboost
diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc
index 23334408f1fa..66428e8de950 100644
--- a/tests/cpp/test_serialization.cc
+++ b/tests/cpp/test_serialization.cc
@@ -179,7 +179,6 @@ TEST_F(SerializationTest, Exact) {
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -189,7 +188,6 @@ TEST_F(SerializationTest, Exact) {
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -198,7 +196,6 @@ TEST_F(SerializationTest, Exact) {
                            {"nthread", "1"},
                            {"base_score", "3.14195265"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -208,7 +205,6 @@ TEST_F(SerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -217,7 +213,6 @@ TEST_F(SerializationTest, Approx) {
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -225,7 +220,6 @@ TEST_F(SerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -235,7 +229,6 @@ TEST_F(SerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -244,7 +237,6 @@ TEST_F(SerializationTest, Hist) {
                            {"nthread", "1"},
                            {"max_depth", "2"},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -252,7 +244,6 @@ TEST_F(SerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -261,7 +252,6 @@ TEST_F(SerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -270,7 +260,6 @@ TEST_F(SerializationTest, GpuHist) {
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"tree_method", "gpu_hist"}},
@@ -278,7 +267,6 @@
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"num_parallel_tree", "4"},
@@ -287,7 +275,6 @@
   TestLearnerSerialization({{"booster", "dart"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"tree_method", "gpu_hist"}},
@@ -345,7 +332,6 @@ TEST_F(SerializationTest, GPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "gpu_coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -380,7 +366,6 @@ TEST_F(LogitSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -389,7 +374,6 @@ TEST_F(LogitSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -400,7 +384,6 @@ TEST_F(LogitSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -409,7 +392,6 @@ TEST_F(LogitSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -420,7 +402,6 @@ TEST_F(LogitSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -429,7 +410,6 @@ TEST_F(LogitSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -438,7 +418,6 @@ TEST_F(LogitSerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -450,14 +429,12 @@ TEST_F(LogitSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
   TestLearnerSerialization({{"booster", "gbtree"},
                             {"objective", "binary:logistic"},
                             {"seed", "0"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"nthread", "1"},
                             {"max_depth", "2"},
                             {"num_parallel_tree", "4"},
@@ -469,7 +446,6 @@ TEST_F(LogitSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", "2"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
 }
@@ -479,7 +455,6 @@ TEST_F(LogitSerializationTest, GPUCoordDescent) {
                            {"objective", "binary:logistic"},
                            {"seed", "0"},
                            {"nthread", "1"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
 }
@@ -515,7 +490,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -525,7 +499,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
                            {"num_parallel_tree", "4"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
@@ -534,7 +507,6 @@ TEST_F(MultiClassesSerializationTest, Exact) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "exact"}},
                           fmap_, p_dmat_);
 }
@@ -545,7 +517,6 @@ TEST_F(MultiClassesSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
@@ -554,7 +525,6 @@ TEST_F(MultiClassesSerializationTest, Approx) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "approx"}},
                           fmap_, p_dmat_);
 }
@@ -565,7 +535,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -574,7 +543,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"num_parallel_tree", "4"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
@@ -584,7 +552,6 @@ TEST_F(MultiClassesSerializationTest, Hist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "hist"}},
                           fmap_, p_dmat_);
 }
@@ -593,7 +560,6 @@ TEST_F(MultiClassesSerializationTest, CPUCoordDescent) {
   TestLearnerSerialization({{"booster", "gblinear"},
                             {"seed", "0"},
                             {"nthread", "1"},
-                            {"enable_experimental_json_serialization", "1"},
                             {"updater", "coord_descent"}},
                            fmap_, p_dmat_);
 }
@@ -609,7 +575,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            // different result (1e-7) with CPU predictor for some
                            // entries.
                            {"predictor", "gpu_predictor"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
@@ -621,7 +586,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            // GPU_Hist has higher floating point error. 1e-6 doesn't work
                            // after num_parallel_tree goes to 4
                            {"num_parallel_tree", "3"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
@@ -630,7 +594,6 @@ TEST_F(MultiClassesSerializationTest, GpuHist) {
                            {"seed", "0"},
                            {"nthread", "1"},
                            {"max_depth", std::to_string(kClasses)},
-                           {"enable_experimental_json_serialization", "1"},
                            {"tree_method", "gpu_hist"}},
                           fmap_, p_dmat_);
 }
@@ -640,7 +603,6 @@ TEST_F(MultiClassesSerializationTest, GPUCoordDescent) {
                            {"num_class", std::to_string(kClasses)},
                            {"seed", "0"},
                            {"nthread", "1"},
-                           {"enable_experimental_json_serialization", "1"},
                            {"updater", "gpu_coord_descent"}},
                           fmap_, p_dmat_);
 }
diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index fd5c9f43fb2a..5199a27d26e8 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -204,12 +204,11 @@ TEST(GpuHist, EvaluateRootSplit) {
   ASSERT_EQ(maker.hist.Data().size(), hist.size());
   thrust::copy(hist.begin(), hist.end(), maker.hist.Data().begin());
+  std::vector<float> feature_weights;
-  maker.column_sampler.Init(kNCols,
-                            param.colsample_bynode,
-                            param.colsample_bylevel,
-                            param.colsample_bytree,
-                            false);
+  maker.column_sampler.Init(kNCols, feature_weights, param.colsample_bynode,
+                            param.colsample_bylevel, param.colsample_bytree,
+                            false);
   RegTree tree;
   MetaInfo info;
@@ -506,5 +505,17 @@ TEST(GpuHist, ConfigIO) {
   ASSERT_EQ(j_updater, j_updater_roundtrip);
 }
+TEST(GpuHist, MaxDepth) {
+  GenericParameter generic_param(CreateEmptyGenericParam(0));
+  size_t constexpr kRows = 16;
+  size_t constexpr kCols = 4;
+  auto p_mat = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix();
+
+  auto learner = std::unique_ptr<Learner>(Learner::Create({p_mat}));
+  learner->SetParam("max_depth", "32");
+  learner->Configure();
+
+  ASSERT_THROW({learner->UpdateOneIter(0, p_mat);}, dmlc::Error);
+}
 }  // namespace tree
 }  // namespace xgboost
diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc
index dbf2b80a2de4..1dbc5fc2c89b 100644
--- a/tests/cpp/tree/test_tree_model.cc
+++ b/tests/cpp/tree/test_tree_model.cc
@@ -6,6 +6,7 @@
 #include "xgboost/json_io.h"
 namespace xgboost {
+#if DMLC_IO_NO_ENDIAN_SWAP  // skip on big-endian machines
 // Manually construct tree in binary format
 // Do not use structs in case they change
 // We want to preserve backwards compatibility
@@ -85,6 +86,7 @@ TEST(Tree, Load) {
   EXPECT_EQ(tree[1].LeafValue(), 0.1f);
   EXPECT_TRUE(tree[1].IsLeaf());
 }
+#endif  // DMLC_IO_NO_ENDIAN_SWAP
 TEST(Tree, AllocateNode) {
   RegTree tree;
diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index f0978a0afaf4..c44de28bd2ff 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -16,6 +16,20 @@ def test_dmatrix_numpy_init(self):
                 match='is not supported for DeviceQuantileDMatrix'):
             xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))
+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_dmatrix_feature_weights(self):
+        import cupy as cp
+        rng = cp.random.RandomState(1994)
+        data = rng.randn(5, 5)
+        m = xgb.DMatrix(data)
+
+        feature_weights = rng.uniform(size=5)
+        m.set_info(feature_weights=feature_weights)
+
+        cp.testing.assert_array_equal(
+            cp.array(m.get_float_info('feature_weights')),
+            feature_weights.astype(np.float32))
+
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_cupy_init(self):
         import cupy as cp
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index dad7ddc9db0c..acfb6db560be 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -110,16 +110,19 @@ def test_multiclass(self):
         # error must be smaller than 10%
         assert err < 0.1
-        # save dmatrix into binary buffer
-        dtest.save_binary('dtest.buffer')
-        # save model
-        bst.save_model('xgb.model')
-        # load model and data in
-        bst2 = xgb.Booster(model_file='xgb.model')
-        dtest2 = xgb.DMatrix('dtest.buffer')
-        preds2 = bst2.predict(dtest2)
-        # assert they are the same
-        assert np.sum(np.abs(preds2 - preds)) == 0
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dtest_path = os.path.join(tmpdir, 'dtest.buffer')
+            model_path = os.path.join(tmpdir, 'xgb.model')
+            # save dmatrix into binary buffer
+            dtest.save_binary(dtest_path)
+            # save model
+            bst.save_model(model_path)
+            # load model and data in
+            bst2 = xgb.Booster(model_file=model_path)
+            dtest2 = xgb.DMatrix(dtest_path)
+            preds2 = bst2.predict(dtest2)
+            # assert they are the same
+            assert np.sum(np.abs(preds2 - preds)) == 0
     def test_dump(self):
         data = np.random.randn(100, 2)
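The test_basic.py hunk above and the test_basic_models.py hunk below make the same cleanup: model and DMatrix artifacts now go into a tempfile.TemporaryDirectory, so failed runs cannot leave stray files in the working tree. The shared pattern, as a standalone sketch (file names are illustrative):

import os
import tempfile

import xgboost as xgb

def save_load_round_trip(bst, dtest):
    # Everything written under tmpdir is deleted when the block exits.
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, 'model.bin')
        dtest_path = os.path.join(tmpdir, 'dtest.buffer')
        bst.save_model(model_path)
        dtest.save_binary(dtest_path)
        bst2 = xgb.Booster(model_file=model_path)
        return bst2.predict(xgb.DMatrix(dtest_path))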
diff --git a/tests/python/test_basic_models.py b/tests/python/test_basic_models.py
index 3eafdf71d821..529f7784c60d 100644
--- a/tests/python/test_basic_models.py
+++ b/tests/python/test_basic_models.py
@@ -6,6 +6,7 @@
 import testing as tm
 import pytest
 import locale
+import tempfile
 dpath = 'demo/data/'
 dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train')
@@ -60,15 +61,19 @@ def test_dart(self):
         # error must be smaller than 10%
         assert err < 0.1
-        # save dmatrix into binary buffer
-        dtest.save_binary('dtest.buffer')
-        model_path = 'xgb.model.dart'
-        # save model
-        bst.save_model(model_path)
-        # load model and data in
-        bst2 = xgb.Booster(params=param, model_file='xgb.model.dart')
-        dtest2 = xgb.DMatrix('dtest.buffer')
+        with tempfile.TemporaryDirectory() as tmpdir:
+            dtest_path = os.path.join(tmpdir, 'dtest.dmatrix')
+            model_path = os.path.join(tmpdir, 'xgboost.model.dart')
+            # save dmatrix into binary buffer
+            dtest.save_binary(dtest_path)
+            # save model
+            bst.save_model(model_path)
+            # load model and data in
+            bst2 = xgb.Booster(params=param, model_file=model_path)
+            dtest2 = xgb.DMatrix(dtest_path)
+            preds2 = bst2.predict(dtest2, ntree_limit=num_round)
+            # assert they are the same
             assert np.sum(np.abs(preds2 - preds)) == 0
@@ -103,7 +109,6 @@ def my_logloss(preds, dtrain):
         for ii in range(len(preds_list)):
             for jj in range(ii + 1, len(preds_list)):
                 assert np.sum(np.abs(preds_list[ii] - preds_list[jj])) > 0
-        os.remove(model_path)
     def run_eta_decay(self, tree_method):
         watchlist = [(dtest, 'eval'), (dtrain, 'train')]
diff --git a/tests/python/test_cli.py b/tests/python/test_cli.py
index 3ff37ea521c8..e437f426cc6f 100644
--- a/tests/python/test_cli.py
+++ b/tests/python/test_cli.py
@@ -47,9 +47,12 @@ def test_cli_model(self):
         seed = 1994
         with tempfile.TemporaryDirectory() as tmpdir:
-            model_out_cli = os.path.join(tmpdir, 'test_load_cli_model-cli.bin')
-            model_out_py = os.path.join(tmpdir, 'test_cli_model-py.bin')
-            config_path = os.path.join(tmpdir, 'test_load_cli_model.conf')
+            model_out_cli = os.path.join(
+                tmpdir, 'test_load_cli_model-cli.json')
+            model_out_py = os.path.join(
+                tmpdir, 'test_cli_model-py.json')
+            config_path = os.path.join(
+                tmpdir, 'test_load_cli_model.conf')
             train_conf = self.template.format(data_path=data_path,
                                               seed=seed,
@@ -121,6 +124,8 @@ def test_cli_help(self):
         v = xgboost.__version__
         if v.find('SNAPSHOT') != -1:
             assert msg.split(':')[1].strip() == v.split('-')[0]
+        elif v.find('rc') != -1:
+            assert msg.split(':')[1].strip() == v.split('rc')[0]
         else:
             assert msg.split(':')[1].strip() == v
diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 8b6535dbff45..33e64f7dd40b 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -1,12 +1,10 @@
 import os
 import subprocess
-import sys
 import pytest
 import testing as tm
-CURRENT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
+ROOT_DIR = tm.PROJECT_ROOT
 DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
 PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')
@@ -19,21 +17,27 @@ def test_basic_walkthrough():
     os.remove('dump.raw.txt')
+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_multiclass_objective():
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_rmsle_objective():
-    major, minor = sys.version_info[:2]
-    if minor < 6:
-        pytest.skip('Skipping RMLSE test due to Python version being too low.')
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_matplotlib())
+def test_feature_weights_demo():
+    script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
+    cmd = ['python', script, '--plot=0']
+    subprocess.check_call(cmd)
+
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_sklearn_demo():
     script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')
@@ -105,6 +109,8 @@ def test_evals_result_demo():
     subprocess.check_call(cmd)
+@pytest.mark.skipif(**tm.no_sklearn())
+@pytest.mark.skipif(**tm.no_pandas())
 def test_aft_demo():
     script = os.path.join(DEMO_DIR, 'aft_survival', 'aft_survival_demo.py')
     cmd = ['python', script]
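The test_dmatrix.py changes below exercise the new feature_weights field end to end. The user-facing surface is small: weights go in through set_info, are stored as float32 like the other float fields, come back out through get_float_info, and must be non-negative (the test asserts a ValueError otherwise). A minimal sketch:

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
m = xgb.DMatrix(rng.randn(10, 50))

fw = rng.uniform(size=50)  # one non-negative weight per feature
m.set_info(feature_weights=fw)

# Round-trips through the float32 storage used for other float fields.
np.testing.assert_allclose(m.get_float_info('feature_weights'),
                           fw.astype(np.float32))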
diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index ecf5f60411bf..f641ea2c54f4 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -99,6 +99,11 @@ def test_slice(self):
         X = rng.randn(100, 100)
         y = rng.randint(low=0, high=3, size=100)
         d = xgb.DMatrix(X, y)
+        np.testing.assert_equal(d.get_label(), y.astype(np.float32))
+
+        fw = rng.uniform(size=100).astype(np.float32)
+        d.set_info(feature_weights=fw)
+
         eval_res_0 = {}
         booster = xgb.train(
             {'num_class': 3, 'objective': 'multi:softprob'}, d,
@@ -106,19 +111,23 @@
         predt = booster.predict(d)
         predt = predt.reshape(100 * 3, 1)
+        d.set_base_margin(predt)
         ridxs = [1, 2, 3, 4, 5, 6]
-        d = d.slice(ridxs)
-        sliced_margin = d.get_float_info('base_margin')
+        sliced = d.slice(ridxs)
+
+        sliced_margin = sliced.get_float_info('base_margin')
         assert sliced_margin.shape[0] == len(ridxs) * 3
         eval_res_1 = {}
-        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
-                  num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
+        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
+                  num_boost_round=2, evals=[(sliced, 'd')],
+                  evals_result=eval_res_1)
         eval_res_0 = eval_res_0['d']['merror']
         eval_res_1 = eval_res_1['d']['merror']
+
         for i in range(len(eval_res_0)):
             assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02
@@ -196,13 +205,33 @@ def test_get_info(self):
         dtrain.get_float_info('base_margin')
         dtrain.get_uint_info('group_ptr')
+    def test_feature_weights(self):
+        kRows = 10
+        kCols = 50
+        rng = np.random.RandomState(1994)
+        fw = rng.uniform(size=kCols)
+        X = rng.randn(kRows, kCols)
+        m = xgb.DMatrix(X)
+        m.set_info(feature_weights=fw)
+        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
+        # Handle empty
+        m.set_info(feature_weights=np.empty((0, 0)))
+
+        assert m.get_float_info('feature_weights').shape[0] == 0
+
+        fw -= 1
+
+        def assign_weight():
+            m.set_info(feature_weights=fw)
+        self.assertRaises(ValueError, assign_weight)
+
     def test_sparse_dmatrix_csr(self):
         nrow = 100
         ncol = 1000
         x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
         assert x.indices.max() < ncol - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
@@ -215,7 +244,7 @@ def test_sparse_dmatrix_csc(self):
         x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
         assert x.indices.max() < nrow - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py
index 9338c095d657..4fca3e59302b 100644
--- a/tests/python/test_early_stopping.py
+++ b/tests/python/test_early_stopping.py
@@ -82,6 +82,7 @@ def test_cv_early_stopping(self):
         self.assert_metrics_length(cv, 1)
     @pytest.mark.skipif(**tm.no_sklearn())
+    @pytest.mark.skipif(**tm.no_pandas())
     def test_cv_early_stopping_with_multiple_eval_sets_and_metrics(self):
         from sklearn.datasets import load_breast_cancer
diff --git a/tests/python/test_model_compatibility.py b/tests/python/test_model_compatibility.py
index 55110720bf92..e02134d6cc71 100644
--- a/tests/python/test_model_compatibility.py
+++ b/tests/python/test_model_compatibility.py
@@ -1,10 +1,12 @@
 import xgboost
 import os
 import generate_models as gm
+import testing as tm
 import json
 import zipfile
 import pytest
 import copy
+import urllib.request
 def run_model_param_check(config):
@@ -87,6 +89,7 @@ def run_scikit_model_check(name, path):
         assert False
+@pytest.mark.skipif(**tm.no_sklearn())
 def test_model_compatibility():
     '''Test model compatibility, can only be run on CI as others don't have
     the credentials.
     '''
     path = os.path.dirname(os.path.abspath(__file__))
     path = os.path.join(path, 'models')
-    try:
-        import boto3
-        import botocore
-    except ImportError:
-        pytest.skip(
-            'Skiping compatibility tests as boto3 is not installed.')
-
-    s3_bucket = boto3.resource('s3').Bucket('xgboost-ci-jenkins-artifacts')
-    zip_path = 'xgboost_model_compatibility_test.zip'
-    s3_bucket.download_file(zip_path, zip_path)
+    zip_path, _ = urllib.request.urlretrieve('https://xgboost-ci-jenkins-artifacts.s3-us-west-2'
+                                             + '.amazonaws.com/xgboost_model_compatibility_test.zip')
     with zipfile.ZipFile(zip_path, 'r') as z:
         z.extractall(path)
diff --git a/tests/python/test_plotting.py b/tests/python/test_plotting.py
index 18b0b83c7d60..e5e3a96e1bb0 100644
--- a/tests/python/test_plotting.py
+++ b/tests/python/test_plotting.py
@@ -14,27 +14,27 @@
 except ImportError:
     pass
+pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(),
+                                                 tm.no_graphviz()))
-pytestmark = pytest.mark.skipif(**tm.no_multiple(tm.no_matplotlib(), tm.no_graphviz()))
-
-
-dpath = 'demo/data/'
-rng = np.random.RandomState(1994)
+dpath = 'demo/data/agaricus.txt.train'
 class TestPlotting(unittest.TestCase):
-
     def test_plotting(self):
-        bst2 = xgb.Booster(model_file='xgb.model')
+        m = xgb.DMatrix(dpath)
+        booster = xgb.train({'max_depth': 2, 'eta': 1,
+                             'objective': 'binary:logistic'}, m,
+                            num_boost_round=2)
-        ax = xgb.plot_importance(bst2)
+        ax = xgb.plot_importance(booster)
         assert isinstance(ax, Axes)
         assert ax.get_title() == 'Feature importance'
         assert ax.get_xlabel() == 'F score'
         assert ax.get_ylabel() == 'Features'
         assert len(ax.patches) == 4
-        ax = xgb.plot_importance(bst2, color='r',
+        ax = xgb.plot_importance(booster, color='r',
                                  title='t', xlabel='x', ylabel='y')
         assert isinstance(ax, Axes)
         assert ax.get_title() == 't'
@@ -44,7 +44,7 @@ def test_plotting(self):
         for p in ax.patches:
             assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red
-        ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
+        ax = xgb.plot_importance(booster, color=['r', 'r', 'b', 'b'],
                                  title=None, xlabel=None, ylabel=None)
         assert isinstance(ax, Axes)
         assert ax.get_title() == ''
@@ -56,10 +56,10 @@ def test_plotting(self):
         assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
         assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
-        g = xgb.to_graphviz(bst2, num_trees=0)
+        g = xgb.to_graphviz(booster, num_trees=0)
         assert isinstance(g, Source)
-        ax = xgb.plot_tree(bst2, num_trees=0)
+        ax = xgb.plot_tree(booster, num_trees=0)
         assert isinstance(ax, Axes)
     def test_importance_plot_lim(self):
diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py
index dc5c155e6027..145fa0b524cd 100644
--- a/tests/python/test_with_dask.py
+++ b/tests/python/test_with_dask.py
@@ -501,17 +501,20 @@ def run_updater_test(self, client, params, num_rounds, dataset,
                           num_boost_round=num_rounds,
                           evals=[(m, 'train')])['history']
         note(history)
-        assert tm.non_increasing(history['train'][dataset.metric])
+        history = history['train'][dataset.metric]
+        assert tm.non_increasing(history)
+        # Make sure that it's decreasing
+        assert history[-1] < history[0]
     @given(params=hist_parameter_strategy,
-           num_rounds=strategies.integers(10, 20),
+           num_rounds=strategies.integers(20, 30),
            dataset=tm.dataset_strategy)
     @settings(deadline=None)
     def test_hist(self, params, num_rounds, dataset, client):
        self.run_updater_test(client, params, num_rounds, dataset, 'hist')
     @given(params=exact_parameter_strategy,
-           num_rounds=strategies.integers(10, 20),
+           num_rounds=strategies.integers(20, 30),
            dataset=tm.dataset_strategy)
     @settings(deadline=None)
     def test_approx(self, client, params, num_rounds, dataset):
@@ -524,8 +527,7 @@ def run_quantile(self, name):
         exe = None
         for possible_path in {'./testxgboost', './build/testxgboost',
                               '../build/testxgboost',
-                              '../cpu-build/testxgboost',
-                              '../gpu-build/testxgboost'}:
+                              '../cpu-build/testxgboost'}:
             if os.path.exists(possible_path):
                 exe = possible_path
         if exe is None:
@@ -542,7 +544,7 @@ def runit(worker_addr, rabit_args):
             port = port.split('=')
             env = os.environ.copy()
             env[port[0]] = port[1]
-            return subprocess.run([exe, test], env=env, stdout=subprocess.PIPE)
+            return subprocess.run([exe, test], env=env, capture_output=True)
         with LocalCluster(n_workers=4) as cluster:
             with Client(cluster) as client:
@@ -555,6 +557,7 @@ def runit(worker_addr, rabit_args):
                                         workers=workers,
                                         rabit_args=rabit_args)
                 results = client.gather(futures)
+
                 for ret in results:
                     msg = ret.stdout.decode('utf-8')
                     assert msg.find('1 test from Quantile') != -1, msg
@@ -563,4 +566,14 @@
     @pytest.mark.skipif(**tm.no_dask())
     @pytest.mark.gtest
     def test_quantile_basic(self):
+        self.run_quantile('DistributedBasic')
+
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.gtest
+    def test_quantile(self):
+        self.run_quantile('Distributed')
+
+    @pytest.mark.skipif(**tm.no_dask())
+    @pytest.mark.gtest
+    def test_quantile_same_on_all_workers(self):
         self.run_quantile('SameOnAllWorkers')
diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 7f62a3e83052..ce0b57e823ff 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,3 +1,5 @@
+import collections
+import importlib.util
 import numpy as np
 import xgboost as xgb
 from xgboost.sklearn import XGBoostLabelEncoder
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
                   eval_set=[(X_train, y_train), (X_test, y_test)],
                   sample_weight_eval_set=[weights_train])
+
 def test_validation_weights_xgbclassifier():
     from sklearn.datasets import make_hastie_10_2
@@ -920,6 +923,64 @@ def test_pandas_input():
                             np.array([0, 1]))
+def run_feature_weights(increasing):
+    with TemporaryDirectory() as tmpdir:
+        kRows = 512
+        kCols = 64
+        colsample_bynode = 0.5
+        reg = xgb.XGBRegressor(tree_method='hist',
+                               colsample_bynode=colsample_bynode)
+        X = rng.randn(kRows, kCols)
+        y = rng.randn(kRows)
+        fw = np.ones(shape=(kCols,))
+        for i in range(kCols):
+            if increasing:
+                fw[i] *= float(i)
+            else:
+                fw[i] *= float(kCols - i)
+
+        reg.fit(X, y, feature_weights=fw)
+        model_path = os.path.join(tmpdir, 'model.json')
+        reg.save_model(model_path)
+        with open(model_path) as fd:
+            model = json.load(fd)
+
+        parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
+                                   'json_parser.py')
+        spec = importlib.util.spec_from_file_location("JsonParser",
+                                                      parser_path)
+        foo = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(foo)
+        model = foo.Model(model)
+        splits = {}
+        total_nodes = 0
+        for tree in model.trees:
+            n_nodes = len(tree.nodes)
+            total_nodes += n_nodes
+            for n in range(n_nodes):
+                if tree.is_leaf(n):
+                    continue
+                if splits.get(tree.split_index(n), None) is None:
+                    splits[tree.split_index(n)] = 1
+                else:
+                    splits[tree.split_index(n)] += 1
+
+        od = collections.OrderedDict(sorted(splits.items()))
+        tuples = [(k, v) for k, v in od.items()]
+        k, v = list(zip(*tuples))
+        w = np.polyfit(k, v, deg=1)
+        return w
+
+
+def test_feature_weights():
+    poly_increasing = run_feature_weights(True)
+    poly_decreasing = run_feature_weights(False)
+    # Approximated test; this depends on the implementation of the random
+    # number generator in the standard library.
+    assert poly_increasing[0] > 0.08
+    assert poly_decreasing[0] < -0.08
+
+
 class TestBoostFromPrediction(unittest.TestCase):
     def run_boost_from_prediction(self, tree_method):
         from sklearn.datasets import load_breast_cancer
diff --git a/tests/python/testing.py b/tests/python/testing.py
index c3f78f78e966..30b44079607b 100644
--- a/tests/python/testing.py
+++ b/tests/python/testing.py
@@ -2,13 +2,17 @@
 import os
 from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED
 from xgboost.compat import DASK_INSTALLED
+import pytest
+import tempfile
+import xgboost as xgb
+import numpy as np
+
+hypothesis = pytest.importorskip('hypothesis')
+sklearn = pytest.importorskip('sklearn')
 from hypothesis import strategies
 from hypothesis.extra.numpy import arrays
 from joblib import Memory
 from sklearn import datasets
-import tempfile
-import xgboost as xgb
-import numpy as np
 try:
     import cupy as cp
diff --git a/tests/travis/run_test.sh b/tests/travis/run_test.sh
index a0e1c9f28651..500aa1e57ae1 100755
--- a/tests/travis/run_test.sh
+++ b/tests/travis/run_test.sh
@@ -88,3 +88,19 @@ if [ ${TASK} == "cmake_test" ]; then
   cd ..
   rm -rf build
 fi
+
+if [ ${TASK} == "s390x_test" ]; then
+  set -e
+
+  # Build and run C++ tests
+  rm -rf build
+  mkdir build && cd build
+  cmake .. -DCMAKE_VERBOSE_MAKEFILE=ON -DGOOGLE_TEST=ON -DUSE_OPENMP=ON -DUSE_DMLC_GTEST=ON -GNinja
+  time ninja -v
+  ./testxgboost
+
+  # Run model compatibility tests
+  cd ..
+  python3 -m pip install --user pytest hypothesis
+  PYTHONPATH=./python-package python3 -m pytest --fulltrace -v -rxXs tests/python/ -k 'test_model'
+fi
diff --git a/tests/travis/setup.sh b/tests/travis/setup.sh
index 5a7a91671da0..0e9f7e8fd687 100755
--- a/tests/travis/setup.sh
+++ b/tests/travis/setup.sh
@@ -20,6 +20,15 @@ if [ ${TASK} == "cmake_test" ] && [ ${TRAVIS_OS_NAME} == "osx" ]; then
   sudo softwareupdate -i "Command Line Tools (macOS High Sierra version 10.13) for Xcode-9.3"
 fi
+
+if [ ${TASK} == "s390x_test" ] && [ ${TRAVIS_CPU_ARCH} == "s390x" ]; then
+  sudo snap install cmake --channel=3.17/beta --classic
+  export PATH=/snap/bin:${PATH}
+  cmake --version
+  sudo apt-get update
+  sudo apt-get install -y --no-install-recommends tar unzip wget git build-essential ninja-build \
+    time python3 python3-pip python3-numpy python3-scipy python3-sklearn r-base
+fi
+
 if [ ${TASK} == "python_sdist_test" ] && [ ${TRAVIS_OS_NAME} == "linux" ]; then
   wget https://github.com/Kitware/CMake/releases/download/v3.17.1/cmake-3.17.1-Linux-x86_64.sh
   sudo bash cmake-3.17.1-Linux-x86_64.sh --prefix=/usr/local --skip-license
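End to end, the feature-weight plumbing added in this patch biases column sampling toward heavily weighted features, which is exactly what run_feature_weights verifies by fitting a line to the per-feature split counts. A minimal usage sketch with the sklearn wrapper (values are illustrative; a colsample_* setting below 1 is needed for the weights to have any effect):

import numpy as np
import xgboost as xgb

rng = np.random.RandomState(1994)
X, y = rng.randn(512, 64), rng.randn(512)

# Linearly increasing weights: higher-index columns are sampled, and
# therefore split on, more often under colsample_bynode sampling.
fw = np.arange(1, X.shape[1] + 1, dtype=np.float32)

reg = xgb.XGBRegressor(tree_method='hist', colsample_bynode=0.5)
reg.fit(X, y, feature_weights=fw)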