Merge branch 'master' into fix-repartition

dmlc · Sep 13, 2022 · 9f8f64e · 9f8f64e
2 parents ec05b26 + a268654
commit 9f8f64e
Show file tree

Hide file tree

Showing 106 changed files with 3,590 additions and 247 deletions.
diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
@@ -31,8 +31,8 @@ jobs:
       uses: actions/cache@v2
       with:
         path: ${{ env.R_LIBS_USER }}
-        key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+        key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
     - name: Install dependencies
       shell: Rscript {0}
@@ -80,20 +80,25 @@ jobs:
       uses: actions/cache@v2
       with:
         path: ${{ env.R_LIBS_USER }}
-        key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+        key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
     - name: Install dependencies
       shell: Rscript {0}
+      if: matrix.config.os != 'windows-latest'
       run: |
         install.packages(${{ env.R_PACKAGES }},
                          repos = 'http://cloud.r-project.org',
                          dependencies = c('Depends', 'Imports', 'LinkingTo'))
-    - name: Install igraph on Windows
+
+    - name: Install binary dependencies
       shell: Rscript {0}
       if: matrix.config.os == 'windows-latest'
       run: |
-        install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))
+        install.packages(${{ env.R_PACKAGES }},
+                         type = 'binary',
+                         repos = 'http://cloud.r-project.org',
+                         dependencies = c('Depends', 'Imports', 'LinkingTo'))
 
     - uses: actions/setup-python@v2
       with:
@@ -132,8 +137,8 @@ jobs:
       uses: actions/cache@v2
       with:
         path: ${{ env.R_LIBS_USER }}
-        key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+        key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+        restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
     - name: Install dependencies
       shell: Rscript {0}

diff --git a/.gitignore b/.gitignore
@@ -97,8 +97,11 @@ metastore_db
 R-package/src/Makevars
 *.lib
 
-# Visual Studio Code
-/.vscode/
+# Visual Studio
+.vs/
+CMakeSettings.json
+*.ilk
+*.pdb
 
 # IntelliJ/CLion
 .idea

diff --git a/.readthedocs.yaml b/.readthedocs.yaml
@@ -0,0 +1,28 @@
+# .readthedocs.yaml
+# Read the Docs configuration file
+# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
+
+# Required
+version: 2
+
+# Set the version of Python and other tools you might need
+build:
+  os: ubuntu-22.04
+  tools:
+    python: "3.8"
+  apt_packages:
+    - graphviz
+
+# Build documentation in the doc/ directory with Sphinx
+sphinx:
+   configuration: doc/conf.py
+
+# If using Sphinx, optionally build your docs in additional formats such as PDF
+formats:
+   - pdf
+
+# Optionally declare the Python requirements required to build your docs
+python:
+  install:
+   - requirements: doc/requirements.txt
+  system_packages: true
diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
@@ -30,7 +30,7 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
 
 PKG_CPPFLAGS=  -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -lwsock32 -lws2_32
 OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \
          $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \
          $(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/rabit_c_api.o \

diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
@@ -71,6 +71,9 @@
 #include "../src/logging.cc"
 #include "../src/global_config.cc"
 
+// collective
+#include "../src/collective/communicator.cc"
+
 // common
 #include "../src/common/common.cc"
 #include "../src/common/column_matrix.cc"

diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
@@ -244,7 +244,7 @@ macro(xgboost_target_properties target)
       $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
       -D_CRT_SECURE_NO_WARNINGS
       -D_CRT_SECURE_NO_DEPRECATE
-      )
+    )
   endif (MSVC)
 
   if (WIN32 AND MINGW)
@@ -314,4 +314,8 @@ macro(xgboost_target_link_libraries target)
   if (RABIT_BUILD_MPI)
     target_link_libraries(${target} PRIVATE MPI::MPI_CXX)
   endif (RABIT_BUILD_MPI)
+
+  if (MINGW)
+    target_link_libraries(${target} PRIVATE wsock32 ws2_32)
+  endif (MINGW)
 endmacro(xgboost_target_link_libraries)
diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst
@@ -39,24 +39,58 @@ task of cross-compiling a Python wheel. (Note that ``cibuildwheel`` will call
 a glue code to call CMake and a C++ compiler to build the native library on the fly.)
 
 *******************************
-Reproducing errors from Jenkins
+Elastic CI Stack with BuildKite
 *******************************
 
-It is often useful to reproduce the particular testing environment from our Jenkins server for
-the purpose of troubleshooting a failing test. We use Docker containers heavily to package
-the testing environment, so you can use Docker to reproduce it on your own machine.
+`BuildKite <https://buildkite.com/home>`_ is a SaaS (Software as a Service) platform that orchestrates
+cloud machines to host CI pipelines. The BuildKite platform allows us to define cloud resources in
+a declarative fashion. Every configuration step is now documented explicitly as code.
 
-1. Install Docker: https://docs.docker.com/engine/install/ubuntu/
-2. Install NVIDIA Docker runtime: https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#installing-on-ubuntu-and-debian
-   The runtime lets you access NVIDIA GPUs inside a Docker container.
-3. In a build log, all tests are invoked via the wrapper script ``tests/ci_build/ci_build.sh``.
-   Identify the test you'd like to reproduce locally, and note how the wrapper script was invoked for that test.
-   The invocation should look like this:
+**Prerequisite**: You should have some knowledge of `CloudFormation <https://aws.amazon.com/cloudformation/>`_.
+CloudFormation lets us define a stack of cloud resources (EC2 machines, Lambda functions, S3, etc.) using
+a single YAML file.
 
-.. code-block:: bash
+**Prerequisite**: Gain access to the XGBoost project's AWS account (``admin@xgboost-ci.net``), and then
+set up a credential pair in order to provision resources on AWS. See
+`Creating an IAM user in your AWS account <https://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html>`_.
 
-  CI_DOCKER_EXTRA_PARAMS_INIT='--shm-size=4g' tests/ci_build/ci_build.sh gpu nvidia-docker \
-    --build-arg CUDA_VERSION_ARG=11.0 tests/ci_build/test_python.sh mgpu --use-rmm-pool
+* Option 1. Give full admin privileges to your IAM user. This is the simplest option.
+* Option 2. Give a limited set of permissions to your IAM user, to reduce the possibility of messing up other resources.
+  For this, use the script ``tests/buildkite/infrastructure/service-user/create_service_user.py``.
 
-4. You can now run the same command on your own machine. The wrapper script will automatically download and
-   set up the correct Docker container(s).
+=====================
+Worker Image Pipeline
+=====================
+Building images for worker machines used to be a chore: you'd provision an EC2 machine, SSH into it, and
+manually install the necessary packages. This process is not only laborious but also error-prone. You may
+forget to install a package or change a system configuration.
+
+No more. Now we have an automated pipeline for building images for worker machines.
+
+* Run ``tests/buildkite/infrastructure/worker-image-pipeline/create_worker_image_pipelines.py`` in order to provision
+  CloudFormation stacks named ``buildkite-linux-amd64-gpu-worker`` and ``buildkite-windows-gpu-worker``. They are
+  pipelines that create AMIs (Amazon Machine Images) for Linux and Windows workers, respectively.
+* Navigate to the CloudFormation web console to verify that the image builder pipelines have been provisioned. It may
+  take some time.
+* Once the pipelines have been fully provisioned, run the script
+  ``tests/buildkite/infrastructure/worker-image-pipeline/run_pipelines.py`` to execute the pipelines. New AMIs will be
+  uploaded to the EC2 service. You can locate them in the EC2 console.
+* Make sure to modify ``tests/buildkite/infrastructure/aws-stack-creator/metadata.py`` to use the correct AMI IDs.
+  (For ``linux-amd64-cpu`` and ``linux-arm64-cpu``, use the AMIs provided by BuildKite. Consult the ``AWSRegion2AMI``
+  section of https://s3.amazonaws.com/buildkite-aws-stack/latest/aws-stack.yml.)
+
+======================
+EC2 Autoscaling Groups
+======================
+In EC2, you can create auto-scaling groups, where you can dynamically adjust the number of worker instances according to
+workload. When a pull request is submitted, the following steps take place:
+
+1. GitHub sends a signal to the registered webhook, which connects to the BuildKite server.
+2. BuildKite sends a signal to a `Lambda <https://aws.amazon.com/lambda/>`_ function named ``Autoscaling``.
+3. The Lambda function sends a signal to the auto-scaling group. The group scales up and adds additional worker instances.
+4. New worker instances run the test jobs. Test results are reported back to BuildKite.
+5. When the test jobs complete, BuildKite sends a signal to ``Autoscaling``, which in turn requests the autoscaling group
+   to scale down. Idle worker instances are shut down.
+
+To set up the auto-scaling group, run the script ``tests/buildkite/infrastructure/aws-stack-creator/create_stack.py``.
+Check the CloudFormation web console to verify successful provisioning of auto-scaling groups.
diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
@@ -9,10 +9,12 @@
 
 #ifdef __cplusplus
 #define XGB_EXTERN_C extern "C"
+#include <cstddef>
 #include <cstdio>
 #include <cstdint>
 #else
 #define XGB_EXTERN_C
+#include <stddef.h>
 #include <stdio.h>
 #include <stdint.h>
 #endif  // __cplusplus
@@ -1386,4 +1388,135 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, const char *json_config,
                                   bst_ulong *out_dim,
                                   bst_ulong const **out_shape,
                                   float const **out_scores);
+
+/*!
+ * \brief Initialize the collective communicator.
+ *
+ *  Currently the communicator API is experimental, function signatures may change in the future
+ *  without notice.
+ *
+ *  Call this once before using anything.
+ *
+ *  The additional configuration is not required. Usually the communicator will detect settings
+ *  from environment variables.
+ *
+ * \param json_config JSON encoded configuration. Accepted JSON keys are:
+ *   - xgboost_communicator: The type of the communicator. Can be set as an environment variable.
+ *     * rabit: Use Rabit. This is the default if the type is unspecified.
+ *     * mpi: Use MPI.
+ *     * federated: Use the gRPC interface for Federated Learning.
+ * Only applicable to the Rabit communicator (these are case-sensitive):
+ *   - rabit_tracker_uri: Hostname of the tracker.
+ *   - rabit_tracker_port: Port number of the tracker.
+ *   - rabit_task_id: ID of the current task, can be used to obtain deterministic rank assignment.
+ *   - rabit_world_size: Total number of workers.
+ *   - rabit_hadoop_mode: Enable Hadoop support.
+ *   - rabit_tree_reduce_minsize: Minimal size for tree reduce.
+ *   - rabit_reduce_ring_mincount: Minimal count to perform ring reduce.
+ *   - rabit_reduce_buffer: Size of the reduce buffer.
+ *   - rabit_bootstrap_cache: Size of the bootstrap cache.
+ *   - rabit_debug: Enable debugging.
+ *   - rabit_timeout: Enable timeout.
+ *   - rabit_timeout_sec: Timeout in seconds.
+ *   - rabit_enable_tcp_no_delay: Enable TCP no delay on Unix platforms.
+ * Only applicable to the Rabit communicator (these are case-sensitive, and can be set as
+ * environment variables):
+ *   - DMLC_TRACKER_URI: Hostname of the tracker.
+ *   - DMLC_TRACKER_PORT: Port number of the tracker.
+ *   - DMLC_TASK_ID: ID of the current task, can be used to obtain deterministic rank assignment.
+ *   - DMLC_ROLE: Role of the current task, "worker" or "server".
+ *   - DMLC_NUM_ATTEMPT: Number of attempts after task failure.
+ *   - DMLC_WORKER_CONNECT_RETRY: Number of retries to connect to the tracker.
+ * Only applicable to the Federated communicator (use upper case for environment variables, use
+ * lower case for runtime configuration):
+ *   - federated_server_address: Address of the federated server.
+ *   - federated_world_size: Number of federated workers.
+ *   - federated_rank: Rank of the current worker.
+ *   - federated_server_cert: Server certificate file path. Only needed for the SSL mode.
+ *   - federated_client_key: Client key file path. Only needed for the SSL mode.
+ *   - federated_client_cert: Client certificate file path. Only needed for the SSL mode.
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorInit(char const* json_config);
+
+/*!
+ * \brief Finalize the collective communicator.
+ *
+ * Call this function after you have finished all jobs.
+ *
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorFinalize(void);
+
+/*!
+ * \brief Get rank of current process.
+ *
+ * \return Rank of the worker.
+ */
+XGB_DLL int XGCommunicatorGetRank(void);
+
+/*!
+ * \brief Get total number of processes.
+ *
+ * \return Total world size.
+ */
+XGB_DLL int XGCommunicatorGetWorldSize(void);
+
+/*!
+ * \brief Get if the communicator is distributed.
+ *
+ * \return True if the communicator is distributed.
+ */
+XGB_DLL int XGCommunicatorIsDistributed(void);
+
+/*!
+ * \brief Print the message to the communicator.
+ *
+ * This function can be used to communicate the information of the progress to the user who monitors
+ * the communicator.
+ *
+ * \param message The message to be printed.
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorPrint(char const *message);
+
+/*!
+ * \brief Get the name of the processor.
+ *
+ * \param name_str Pointer that receives the returned processor name.
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorGetProcessorName(const char** name_str);
+
+/*!
+ * \brief Broadcast a memory region to all others from root.  This function is NOT thread-safe.
+ *
+ * Example:
+ *   int a = 1;
+ *   Broadcast(&a, sizeof(a), root);
+ *
+ * \param send_receive_buffer Pointer to the send or receive buffer.
+ * \param size Size of the data.
+ * \param root The process rank to broadcast from.
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorBroadcast(void *send_receive_buffer, size_t size, int root);
+
+/*!
+ * \brief Perform in-place allreduce. This function is NOT thread-safe.
+ *
+ * Example Usage: the following code gives sum of the result
+ *     vector<int> data(10);
+ *     ...
+ *     Allreduce(&data[0], data.size(), DataType::kInt32, Operation::kSum);
+ *     ...
+ * \param send_receive_buffer Buffer for both sending and receiving data.
+ * \param count Number of elements to be reduced.
+ * \param data_type Enumeration of data type, see xgboost::collective::DataType in communicator.h.
+ * \param op Enumeration of operation type, see xgboost::collective::Operation in communicator.h.
+ * \return 0 for success, -1 for failure.
+ */
+XGB_DLL int XGCommunicatorAllreduce(void *send_receive_buffer, size_t count, int data_type, int op);
+
+
 #endif  // XGBOOST_C_API_H_