Merge remote-tracking branch 'upstream/master' into fix-repartition
hcho3 committed Sep 11, 2022
2 parents ec05b26 + bc81831 commit 89cde94
Showing 81 changed files with 1,819 additions and 152 deletions.
21 changes: 13 additions & 8 deletions .github/workflows/r_tests.yml
@@ -31,8 +31,8 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
@@ -80,20 +80,25 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
if: matrix.config.os != 'windows-latest'
run: |
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Install igraph on Windows
- name: Install binary dependencies
shell: Rscript {0}
if: matrix.config.os == 'windows-latest'
run: |
install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))
install.packages(${{ env.R_PACKAGES }},
type = 'binary',
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- uses: actions/setup-python@v2
with:
@@ -132,8 +137,8 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
7 changes: 5 additions & 2 deletions .gitignore
@@ -97,8 +97,11 @@ metastore_db
R-package/src/Makevars
*.lib

# Visual Studio Code
/.vscode/
# Visual Studio
.vs/
CMakeSettings.json
*.ilk
*.pdb

# IntelliJ/CLion
.idea
28 changes: 28 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.8"
apt_packages:
- graphviz

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: doc/conf.py

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
- pdf

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: doc/requirements.txt
system_packages: true
2 changes: 1 addition & 1 deletion R-package/src/Makevars.win
@@ -30,7 +30,7 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))

PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -lwsock32 -lws2_32
OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \
$(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \
$(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/rabit_c_api.o \
6 changes: 5 additions & 1 deletion cmake/Utils.cmake
@@ -244,7 +244,7 @@ macro(xgboost_target_properties target)
$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
-D_CRT_SECURE_NO_WARNINGS
-D_CRT_SECURE_NO_DEPRECATE
)
)
endif (MSVC)

if (WIN32 AND MINGW)
@@ -314,4 +314,8 @@ macro(xgboost_target_link_libraries target)
if (RABIT_BUILD_MPI)
target_link_libraries(${target} PRIVATE MPI::MPI_CXX)
endif (RABIT_BUILD_MPI)

if (MINGW)
target_link_libraries(${target} PRIVATE wsock32 ws2_32)
endif (MINGW)
endmacro(xgboost_target_link_libraries)
64 changes: 49 additions & 15 deletions doc/contrib/ci.rst
@@ -39,24 +39,58 @@ task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will invoke
glue code that calls CMake and a C++ compiler to build the native library on the fly.)

*******************************
Reproducing errors from Jenkins
Elastic CI Stack with BuildKite
*******************************

It is often useful to reproduce the particular testing environment from our Jenkins server for
the purpose of troubleshooting a failing test. We use Docker containers heavily to package
the testing environment, so you can use Docker to reproduce it on your own machine.
`BuildKite <https://buildkite.com/home>`_ is a SaaS (Software as a Service) platform that orchestrates
cloud machines to host CI pipelines. The BuildKite platform allows us to define cloud resources in
a declarative fashion. Every configuration step is now documented explicitly as code.

1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
The runtime lets you access NVIDIA GPUs inside a Docker container.
3. In a build log, all tests are invoked via the wrapper script ``tests/ci_build/ci_build.sh``.
Identify the test you'd like to reproduce locally, and note how the wrapper script was invoked for that test.
The invocation should look like this:
**Prerequisite**: You should have some knowledge of `CloudFormation <https://aws.amazon.com/cloudformation/>`_.
CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using
a single YAML file.
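
As an illustration (not part of the XGBoost setup; the stack name and template file below are placeholders), a stack described by a single YAML file can be created from the command line with the AWS CLI:

.. code-block:: bash

    # Deploy a stack from a local CloudFormation template.
    # 'my-ci-stack' and 'stack.yml' are placeholder names used for illustration only.
    aws cloudformation deploy \
        --stack-name my-ci-stack \
        --template-file stack.yml \
        --capabilities CAPABILITY_NAMED_IAM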

.. code-block:: bash
**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
set up a credential pair in order to provision resources on AWS. See
`Creating an IAM user in your AWS account <https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html>`_.

CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/test_python.sh mgpu --use-rmm-pool
* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
* Option 2. Give a limited set of permissions to your IAM user, to reduce the risk of accidentally affecting other resources.
For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``. (A usage sketch follows this list.)
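
A minimal sketch of the Option 2 workflow, assuming your credential pair has already been created (the script's exact arguments are not documented here, so consult the script itself before running it):

.. code-block:: bash

    # Store the credential pair for your IAM user (access key ID, secret access key, region).
    aws configure

    # Run the helper script from the repository root; check the script for any required arguments.
    python3 tests/buildkite/infrastructure/service-user/create_service_user.py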

4. You can now run the same command on your own machine. The wrapper script will automatically download and
set up the correct Docker container(s).
=====================
Worker Image Pipeline
=====================
Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
manually install the necessary packages. This process is not only laborious but also error-prone. You may
forget to install a package or change a system configuration.

No more. Now we have an automated pipeline for building images for worker machines.

* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively.
* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
take some time.
* Once the pipelines have been fully provisioned, run the script
``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
uploaded to the EC2 service. You can locate them in the EC2 console (a command-line alternative is sketched after this list).
* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
(For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI``
section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)
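
As a command-line alternative to browsing the EC2 console, the freshly built AMIs can be listed with the AWS CLI. This is only a sketch; the name filter below is a placeholder, so adjust it to match how the image pipelines actually name their AMIs:

.. code-block:: bash

    # List AMIs owned by the CI account, newest first. The name filter is a placeholder.
    aws ec2 describe-images \
        --owners self \
        --filters "Name=name,Values=buildkite-*" \
        --query 'sort_by(Images, &CreationDate)[::-1].[ImageId, Name, CreationDate]' \
        --output table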

======================
EC2 Autoscaling Groups
======================
In EC2, you can create auto-scaling groups, which dynamically adjust the number of worker instances according to the
workload. When a pull request is submitted, the following steps take place:

1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
2. BuildKite sends a signal to a `Lambda <https://aws.amazon.com/lambda/>`_ function named ``Autoscaling``.
3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
4. New worker instances run the test jobs. Test results are reported back to BuildKite.
5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the auto-scaling group
to scale down. Idle worker instances are shut down. (A command-line sketch for inspecting this scaling activity follows this list.)
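
The scale-up and scale-down activity is driven entirely by AWS, but it can be inspected with the AWS CLI. This is only a sketch; the auto-scaling group name is a placeholder, so look up the actual name in the EC2 console or in the CloudFormation stack outputs:

.. code-block:: bash

    # List the auto-scaling groups and their current desired capacity.
    aws autoscaling describe-auto-scaling-groups \
        --query 'AutoScalingGroups[].{Name: AutoScalingGroupName, Desired: DesiredCapacity}' \
        --output table

    # Inspect recent scale-up / scale-down activity for one group (placeholder name).
    aws autoscaling describe-scaling-activities \
        --auto-scaling-group-name <group-name> \
        --max-items 10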

To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
Check the CloudFormation web console to verify that the auto-scaling groups have been provisioned successfully.
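
A hedged sketch of this step (consult ``create_stack.py`` for any required arguments; they are not documented here). The second command offers a command-line alternative to the web console for checking stack status:

.. code-block:: bash

    # Provision the auto-scaling stacks.
    python3 tests/buildkite/infrastructure/aws-stack-creator/create_stack.py

    # Verify that the stacks reached CREATE_COMPLETE.
    aws cloudformation describe-stacks \
        --query 'Stacks[].{Name: StackName, Status: StackStatus}' \
        --output table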
3 changes: 3 additions & 0 deletions rabit/src/allreduce_base.cc
Expand Up @@ -5,7 +5,10 @@
*
* \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
*/
#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)

#include "rabit/base.h"
#include "rabit/internal/rabit-inl.h"
#include "allreduce_base.h"
Expand Down
4 changes: 4 additions & 0 deletions src/cli_main.cc
Expand Up @@ -6,7 +6,11 @@
*/
#define _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_DEPRECATE

#if !defined(NOMINMAX) && defined(_WIN32)
#define NOMINMAX
#endif // !defined(NOMINMAX)

#include <dmlc/timer.h>

#include <xgboost/learner.h>
Expand Down
43 changes: 43 additions & 0 deletions tests/buildkite/build-cpu-arm64.sh
@@ -0,0 +1,43 @@
#!/bin/bash

set -euo pipefail

WHEEL_TAG=manylinux2014_aarch64

echo "--- Build CPU code targeting ARM64"

source tests/buildkite/conftest.sh

command_wrapper="tests/ci_build/ci_build.sh aarch64 docker"

echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=aarch64_test \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"

echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}

echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
$command_wrapper auditwheel repair --plat ${WHEEL_TAG} python-package/dist/*.whl
$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
$command_wrapper bash -c \
"unzip -l python-package/dist/*.whl | grep libgomp || exit -1"

echo "--- Upload Python wheel"
buildkite-agent artifact upload "python-package/dist/*.whl"
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
--acl public-read --no-progress
fi

echo "--- Stash XGBoost CLI executable"
buildkite-agent artifact upload ./xgboost
33 changes: 33 additions & 0 deletions tests/buildkite/build-cpu.sh
@@ -0,0 +1,33 @@
#!/bin/bash

set -euo pipefail

echo "--- Build CPU code"

source tests/buildkite/conftest.sh

command_wrapper="tests/ci_build/ci_build.sh cpu docker"

$command_wrapper rm -fv dmlc-core/include/dmlc/build_config_default.h
# This step is not strictly necessary, but we include it to ensure that the
# DMLC_CORE_USE_CMAKE flag is correctly propagated. We want to make sure that we use
# the configured header build/dmlc/build_config.h instead of
# include/dmlc/build_config_default.h.
echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh -DPLUGIN_DENSE_PARSER=ON
echo "--- Run Google Test"
$command_wrapper bash -c "cd build && ctest --extra-verbose"
echo "--- Stash XGBoost CLI executable"
buildkite-agent artifact upload ./xgboost

# Sanitizer test
echo "--- Run Google Test with sanitizer enabled"
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_SANITIZER=ON \
-DENABLED_SANITIZERS="address;leak;undefined" -DCMAKE_BUILD_TYPE=Debug \
-DSANITIZER_PATH=/usr/lib/x86_64-linux-gnu/
CI_DOCKER_EXTRA_PARAMS_INIT="-e ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer "`
`"-e ASAN_OPTIONS=symbolize=1 "`
`"-e UBSAN_OPTIONS=print_stacktrace=1:log_path=ubsan_error.log "`
`"--cap-add SYS_PTRACE" \
$command_wrapper bash -c "cd build && ctest --exclude-regex AllTestsInDMLCUnitTests "`
`"--extra-verbose"
26 changes: 26 additions & 0 deletions tests/buildkite/build-cuda-with-rmm.sh
@@ -0,0 +1,26 @@
#!/bin/bash

set -euo pipefail

CUDA_VERSION=11.0.3

source tests/buildkite/conftest.sh

echo "--- Build with CUDA ${CUDA_VERSION}, RMM enabled"

if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"
else
arch_flag=""
fi

command_wrapper="tests/ci_build/ci_build.sh rmm docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"

echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/build_via_cmake.sh --conda-env=gpu_test -DUSE_CUDA=ON \
-DUSE_NCCL=ON -DPLUGIN_RMM=ON -DBUILD_WITH_CUDA_CUB=ON ${arch_flag}

echo "-- Stash C++ test executable (testxgboost)"
buildkite-agent artifact upload build/testxgboost
52 changes: 52 additions & 0 deletions tests/buildkite/build-cuda.sh
@@ -0,0 +1,52 @@
#!/bin/bash

set -euo pipefail

CUDA_VERSION=11.0.3
WHEEL_TAG=manylinux2014_x86_64

echo "--- Build with CUDA ${CUDA_VERSION}"

source tests/buildkite/conftest.sh

if [[ ($is_pull_request == 1) || ($is_release_branch == 0) ]]
then
arch_flag="-DGPU_COMPUTE_VER=75"
else
arch_flag=""
fi

command_wrapper="tests/ci_build/ci_build.sh gpu_build_centos7 docker --build-arg "`
`"CUDA_VERSION_ARG=$CUDA_VERSION"

echo "--- Build libxgboost from the source"
$command_wrapper tests/ci_build/prune_libnccl.sh
$command_wrapper tests/ci_build/build_via_cmake.sh -DUSE_CUDA=ON -DUSE_NCCL=ON \
-DUSE_OPENMP=ON -DHIDE_CXX_SYMBOLS=ON -DUSE_NCCL_LIB_PATH=ON \
-DNCCL_INCLUDE_DIR=/usr/include -DNCCL_LIBRARY=/workspace/libnccl_static.a \
${arch_flag}
echo "--- Build binary wheel"
$command_wrapper bash -c \
"cd python-package && rm -rf dist/* && python setup.py bdist_wheel --universal"
$command_wrapper python tests/ci_build/rename_whl.py python-package/dist/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}

echo "--- Audit binary wheel to ensure it's compliant with manylinux2014 standard"
tests/ci_build/ci_build.sh auditwheel_x86_64 docker auditwheel repair \
--plat ${WHEEL_TAG} python-package/dist/*.whl
$command_wrapper python tests/ci_build/rename_whl.py wheelhouse/*.whl \
${BUILDKITE_COMMIT} ${WHEEL_TAG}
mv -v wheelhouse/*.whl python-package/dist/
# Make sure that libgomp.so is vendored in the wheel
tests/ci_build/ci_build.sh auditwheel_x86_64 docker bash -c \
"unzip -l python-package/dist/*.whl | grep libgomp || exit -1"

echo "--- Upload Python wheel"
buildkite-agent artifact upload python-package/dist/*.whl
if [[ ($is_pull_request == 0) && ($is_release_branch == 1) ]]
then
aws s3 cp python-package/dist/*.whl s3://xgboost-nightly-builds/${BRANCH_NAME}/ \
--acl public-read --no-progress
fi
echo "-- Stash C++ test executable (testxgboost)"
buildkite-agent artifact upload build/testxgboost
