Skip to content

Commit

Permalink
Merge branch 'master' into fix-repartition
Browse files Browse the repository at this point in the history
  • Loading branch information
WeichenXu123 committed Sep 13, 2022
2 parents ec05b26 + a268654 commit 9f8f64e
Show file tree
Hide file tree
Showing 106 changed files with 3,590 additions and 247 deletions.
21 changes: 13 additions & 8 deletions .github/workflows/r_tests.yml
Expand Up @@ -31,8 +31,8 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
Expand Down Expand Up @@ -80,20 +80,25 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
if: matrix.config.os != 'windows-latest'
run: |
install.packages(${{ env.R_PACKAGES }},
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- name: Install igraph on Windows
- name: Install binary dependencies
shell: Rscript {0}
if: matrix.config.os == 'windows-latest'
run: |
install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))
install.packages(${{ env.R_PACKAGES }},
type = 'binary',
repos = 'http://cloud.r-project.org',
dependencies = c('Depends', 'Imports', 'LinkingTo'))
- uses: actions/setup-python@v2
with:
Expand Down Expand Up @@ -132,8 +137,8 @@ jobs:
uses: actions/cache@v2
with:
path: ${{ env.R_LIBS_USER }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}

- name: Install dependencies
shell: Rscript {0}
Expand Down
7 changes: 5 additions & 2 deletions .gitignore
Expand Up @@ -97,8 +97,11 @@ metastore_db
R-package/src/Makevars
*.lib

# Visual Studio Code
/.vscode/
# Visual Studio
.vs/
CMakeSettings.json
*.ilk
*.pdb

# IntelliJ/CLion
.idea
Expand Down
28 changes: 28 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
os: ubuntu-22.04
tools:
python: "3.8"
apt_packages:
- graphviz

# Build documentation in the docs/ directory with Sphinx
sphinx:
configuration: doc/conf.py

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
- pdf

# Optionally declare the Python requirements required to build your docs
python:
install:
- requirements: doc/requirements.txt
system_packages: true
2 changes: 1 addition & 1 deletion R-package/src/Makevars.win
Expand Up @@ -30,7 +30,7 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))

PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -lwsock32 -lws2_32
OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \
$(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \
$(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/rabit_c_api.o \
Expand Down
3 changes: 3 additions & 0 deletions amalgamation/xgboost-all0.cc
Expand Up @@ -71,6 +71,9 @@
#include "../src/logging.cc"
#include "../src/global_config.cc"

// collective
#include "../src/collective/communicator.cc"

// common
#include "../src/common/common.cc"
#include "../src/common/column_matrix.cc"
Expand Down
6 changes: 5 additions & 1 deletion cmake/Utils.cmake
Expand Up @@ -244,7 +244,7 @@ macro(xgboost_target_properties target)
$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
-D_CRT_SECURE_NO_WARNINGS
-D_CRT_SECURE_NO_DEPRECATE
)
)
endif (MSVC)

if (WIN32 AND MINGW)
Expand Down Expand Up @@ -314,4 +314,8 @@ macro(xgboost_target_link_libraries target)
if (RABIT_BUILD_MPI)
target_link_libraries(${target} PRIVATE MPI::MPI_CXX)
endif (RABIT_BUILD_MPI)

if (MINGW)
target_link_libraries(${target} PRIVATE wsock32 ws2_32)
endif (MINGW)
endmacro(xgboost_target_link_libraries)
64 changes: 49 additions & 15 deletions doc/contrib/ci.rst
Expand Up @@ -39,24 +39,58 @@ task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
a glue code to call CMake and a C++ compiler to build the native library on the fly.)

*******************************
Reproducing errors from Jenkins
Elastic CI Stack with BuildKite
*******************************

It is often useful to reproduce the particular testing environment from our Jenkins server for
the purpose of troubleshooting a failing test. We use Docker containers heavily to package
the testing environment, so you can use Docker to reproduce it on your own machine.
`BuildKite <https://buildkite.com/home>`_ is a SaaS (Software as a Service) platform that orchestrates
cloud machines to host CI pipelines. The BuildKite platform allows us to define cloud resources in
a declarative fashion. Every configuration step is now documented explicitly as code.

1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
The runtime lets you access NVIDIA GPUs inside a Docker container.
3. In a build log, all tests are invoked via the wrapper script ``tests/ci_build/ci_build.sh``.
Identify the test you'd like to reproduce locally, and note how the wrapper script was invoked for that test.
The invocation should look like this:
**Prerequisite**: You should have some knowledge of `CloudFormation <https://aws.amazon.com/cloudformation/>`_.
CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3 etc) using
a single YAML file.

.. code-block:: bash
**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
set up a credential pair in order to provision resources on AWS. See
`Creating an IAM user in your AWS account <https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html>`_.

CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' tests/ci_build/ci_build.sh gpu nvidia-docker \
--build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/test_python.sh mgpu --use-rmm-pool
* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
* Option 2. Give limited set of permissions to your IAM user, to reduce the possibility of messing up other resources.
For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``.

4. You can now run the same command on your own machine. The wrapper script will automatically download and
set up the correct Docker container(s).
=====================
Worker Image Pipeline
=====================
Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
manually install the necessary packages. This process is not only laborous but also error-prone. You may
forget to install a package or change a system configuration.

No more. Now we have an automated pipeline for building images for worker machines.

* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively.
* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
take some time.
* Once they pipelines have been fully provisioned, run the script
``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
uploaded to the EC2 service. You can locate them in the EC2 console.
* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
(For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI``
section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)

======================
EC2 Autoscaling Groups
======================
In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to
workload. When a pull request is submitted, the following steps take place:

1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
2. BuildKite sends a signal to a `Lambda <https://aws.amazon.com/lambda/>`_ function named ``Autoscaling``.
3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
4. New worker instances run the test jobs. Test results are reported back to BuildKite.
5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group
to scale down. Idle worker instances are shut down.

To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
Check the CloudFormation web console to verify successful provision of auto-scaling groups.
133 changes: 133 additions & 0 deletions include/xgboost/c_api.h
Expand Up @@ -9,10 +9,12 @@

#ifdef __cplusplus
#define XGB_EXTERN_C extern "C"
#include <cstddef>
#include <cstdio>
#include <cstdint>
#else
#define XGB_EXTERN_C
#include <stddef.h>
#include <stdio.h>
#include <stdint.h>
#endif // __cplusplus
Expand Down Expand Up @@ -1386,4 +1388,135 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config,
bst_ulong *out_dim,
bst_ulong const **out_shape,
float const **out_scores);

/*!
* \brief Initialize the collective communicator.
*
* Currently the communicator API is experimental, function signatures may change in the future
* without notice.
*
* Call this once before using anything.
*
* The additional configuration is not required. Usually the communicator will detect settings
* from environment variables.
*
* \param json_config JSON encoded configuration. Accepted JSON keys are:
* - xgboost_communicator: The type of the communicator. Can be set as an environment variable.
* * rabit: Use Rabit. This is the default if the type is unspecified.
* * mpi: Use MPI.
* * federated: Use the gRPC interface for Federated Learning.
* Only applicable to the Rabit communicator (these are case-sensitive):
* - rabit_tracker_uri: Hostname of the tracker.
* - rabit_tracker_port: Port number of the tracker.
* - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment.
* - rabit_world_size: Total number of workers.
* - rabit_hadoop_mode: Enable Hadoop support.
* - rabit_tree_reduce_minsize: Minimal size for tree reduce.
* - rabit_reduce_ring_mincount: Minimal count to perform ring reduce.
* - rabit_reduce_buffer: Size of the reduce buffer.
* - rabit_bootstrap_cache: Size of the bootstrap cache.
* - rabit_debug: Enable debugging.
* - rabit_timeout: Enable timeout.
* - rabit_timeout_sec: Timeout in seconds.
* - rabit_enable_tcp_no_delay: Enable TCP no delay on Unix platforms.
* Only applicable to the Rabit communicator (these are case-sensitive, and can be set as
* environment variables):
* - DMLC_TRACKER_URI: Hostname of the tracker.
* - DMLC_TRACKER_PORT: Port number of the tracker.
* - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
* - DMLC_ROLE: Role of the current task, "worker" or "server".
* - DMLC_NUM_ATTEMPT: Number of attempts after task failure.
* - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
* Only applicable to the Federated communicator (use upper case for environment variables, use
* lower case for runtime configuration):
* - federated_server_address: Address of the federated server.
* - federated_world_size: Number of federated workers.
* - federated_rank: Rank of the current worker.
* - federated_server_cert: Server certificate file path. Only needed for the SSL mode.
* - federated_client_key: Client key file path. Only needed for the SSL mode.
* - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorInit(char const* json_config);

/*!
* \brief Finalize the collective communicator.
*
* Call this function after you finished all jobs.
*
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorFinalize(void);

/*!
* \brief Get rank of current process.
*
* \return Rank of the worker.
*/
XGB_DLL int XGCommunicatorGetRank(void);

/*!
* \brief Get total number of processes.
*
* \return Total world size.
*/
XGB_DLL int XGCommunicatorGetWorldSize(void);

/*!
* \brief Get if the communicator is distributed.
*
* \return True if the communicator is distributed.
*/
XGB_DLL int XGCommunicatorIsDistributed(void);

/*!
* \brief Print the message to the communicator.
*
* This function can be used to communicate the information of the progress to the user who monitors
* the communicator.
*
* \param message The message to be printed.
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorPrint(char const *message);

/*!
* \brief Get the name of the processor.
*
* \param name_str Pointer to received returned processor name.
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorGetProcessorName(const char** name_str);

/*!
* \brief Broadcast a memory region to all others from root. This function is NOT thread-safe.
*
* Example:
* int a = 1;
* Broadcast(&a, sizeof(a), root);
*
* \param send_receive_buffer Pointer to the send or receive buffer.
* \param size Size of the data.
* \param root The process rank to broadcast from.
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorBroadcast(void *send_receive_buffer, size_t size, int root);

/*!
* \brief Perform in-place allreduce. This function is NOT thread-safe.
*
* Example Usage: the following code gives sum of the result
* vector<int> data(10);
* ...
* Allreduce(&data[0], data.size(), DataType:kInt32, Op::kSum);
* ...
* \param send_receive_buffer Buffer for both sending and receiving data.
* \param count Number of elements to be reduced.
* \param data_type Enumeration of data type, see xgboost::collective::DataType in communicator.h.
* \param op Enumeration of operation type, see xgboost::collective::Operation in communicator.h.
* \return 0 for success, -1 for failure.
*/
XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int data_type, int op);


#endif // XGBOOST_C_API_H_

0 comments on commit 9f8f64e

Please sign in to comment.