From d6781b3f3893c03061dff030f7f148777f38d4a8 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 1 Sep 2022 00:19:50 +0800 Subject: [PATCH 01/10] Copy ellpack to ghist. Start working on sparse data. Fix race. Remove check. Merge functions. Cleanup. Cleanup. Start writing tests. Fix. comp column. Python test. lint. lint. Fix. Cleanup. Avoid binary search. Use quantile dmatrix by default in sklearn interface. dask as well. Fix max_bin. Fix empty dmatrix for CPU. Fix GPU version. Fix empty DMatrix. pylint. --- python-package/xgboost/dask.py | 25 +++++++++++++------ python-package/xgboost/sklearn.py | 23 ++++++++++++++---- src/data/iterative_dmatrix.cc | 40 +++++++++++++++++++++++++++++++ src/data/iterative_dmatrix.cu | 11 +++++++-- src/data/iterative_dmatrix.h | 25 +------------------ 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 75eeba875fee..9a74d0143681 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -726,10 +726,9 @@ def _create_quantile_dmatrix( if parts is None: msg = f"worker {worker.address} has an empty DMatrix." LOGGER.warning(msg) - import cupy d = QuantileDMatrix( - cupy.zeros((0, 0)), + numpy.empty((0, 0)), feature_names=feature_names, feature_types=feature_types, max_bin=max_bin, @@ -1544,15 +1543,21 @@ def inplace_predict( # pylint: disable=unused-argument async def _async_wrap_evaluation_matrices( - client: Optional["distributed.Client"], **kwargs: Any + client: Optional["distributed.Client"], + tree_method: Optional[str], + max_bin: Optional[int], + **kwargs: Any, ) -> Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]]: """A switch function for async environment.""" - def _inner(**kwargs: Any) -> DaskDMatrix: - m = DaskDMatrix(client=client, **kwargs) - return m + def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix: + if tree_method in ("hist", "gpu_hist"): + return DaskQuantileDMatrix( + client=client, ref=ref, max_bin=max_bin, **kwargs + ) + return DaskDMatrix(client=client, **kwargs) - train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_inner, **kwargs) + train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_dispatch, **kwargs) train_dmatrix = await train_dmatrix if evals is None: return train_dmatrix, evals @@ -1756,6 +1761,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( client=self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, @@ -1851,6 +1858,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, @@ -2057,6 +2066,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 98ad43af64ad..d820ccc01f87 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -38,6 +38,7 @@ Booster, DMatrix, Metric, + QuantileDMatrix, XGBoostError, _convert_ntree_limit, _deprecate_positional_args, @@ -430,7 +431,8 @@ def _wrap_evaluation_matrices( enable_categorical: bool, feature_types: Optional[FeatureTypes], ) -> Tuple[Any, List[Tuple[Any, str]]]: - """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.""" + """Convert array_like evaluation matrices into DMatrix. Perform validation on the + way.""" train_dmatrix = create_dmatrix( data=X, label=y, @@ -442,6 +444,7 @@ def _wrap_evaluation_matrices( missing=missing, enable_categorical=enable_categorical, feature_types=feature_types, + ref=None, ) n_validation = 0 if eval_set is None else len(eval_set) @@ -491,6 +494,7 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence: missing=missing, enable_categorical=enable_categorical, feature_types=feature_types, + ref=train_dmatrix, ) evals.append(m) nevals = len(evals) @@ -904,6 +908,17 @@ def _duplicated(parameter: str) -> None: return model, metric, params, early_stopping_rounds, callbacks + def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix: + # Use `QuantileDMatrix` to save memory. + if self.tree_method in ("hist", "gpu_hist"): + try: + return QuantileDMatrix( + **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin + ) + except TypeError: # `QuantileDMatrix` supports lesser types than DMatrix + pass + return DMatrix(**kwargs, nthread=self.n_jobs) + def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None: if evals_result: self.evals_result_ = cast(Dict[str, Dict[str, List[float]]], evals_result) @@ -996,7 +1011,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) @@ -1479,7 +1494,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) @@ -1930,7 +1945,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=eval_group, eval_qid=eval_qid, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index f108c746ba09..352c7cc2cce7 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -14,6 +14,45 @@ namespace xgboost { namespace data { +IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy, + std::shared_ptr ref, DataIterResetCallback* reset, + XGDMatrixCallbackNext* next, float missing, int nthread, + bst_bin_t max_bin) + : proxy_{proxy}, reset_{reset}, next_{next} { + // fetch the first batch + auto iter = + DataIterProxy{iter_handle, reset_, next_}; + iter.Reset(); + bool valid = iter.Next(); + CHECK(valid) << "Iterative DMatrix must have at least 1 batch."; + + auto d = MakeProxy(proxy_)->DeviceIdx(); + + StringView msg{"All batch should be on the same device."}; + if (batch_param_.gpu_id != Context::kCpuId) { + CHECK_EQ(d, batch_param_.gpu_id) << msg; + } + + int32_t max_device{d}; + rabit::Allreduce(&max_device, 1); + if (max_device != d) { + CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0); + CHECK_NE(d, Context::kCpuId) << msg; + d = max_device; + } + + batch_param_ = BatchParam{d, max_bin}; + batch_param_.sparse_thresh = 0.2; // default from TrainParam + + ctx_.UpdateAllowUnknown( + Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); + if (ctx_.IsCPU()) { + this->InitFromCPU(iter_handle, missing, ref); + } else { + this->InitFromCUDA(iter_handle, missing, ref); + } +} + void GetCutsFromRef(std::shared_ptr ref_, bst_feature_t n_features, BatchParam p, common::HistogramCuts* p_cuts) { CHECK(ref_); @@ -199,6 +238,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing, if (n_batches == 1) { this->info_ = std::move(proxy->Info()); this->info_.num_nonzero_ = nnz; + this->info_.num_col_ = n_features; // proxy might be empty. CHECK_EQ(proxy->Info().labels.Size(), 0); } } diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 901662852a15..ceb470a5c7e5 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -173,8 +173,15 @@ BatchSet IterativeDMatrix::GetEllpackBatches(BatchParam const& para } if (!ellpack_ && ghist_) { ellpack_.reset(new EllpackPage()); - this->ctx_.gpu_id = param.gpu_id; - this->Info().feature_types.SetDevice(param.gpu_id); + // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU + // ID. + if (this->ctx_.IsCPU()) { + this->ctx_.gpu_id = param.gpu_id; + } + if (this->ctx_.IsCPU()) { + this->ctx_.gpu_id = dh::CurrentDevice(); + } + this->Info().feature_types.SetDevice(this->ctx_.gpu_id); *ellpack_->Impl() = EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index 7a8e5188c921..30358bb819ca 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -75,30 +75,7 @@ class IterativeDMatrix : public DMatrix { explicit IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy, std::shared_ptr ref, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int nthread, - bst_bin_t max_bin) - : proxy_{proxy}, reset_{reset}, next_{next} { - // fetch the first batch - auto iter = - DataIterProxy{iter_handle, reset_, next_}; - iter.Reset(); - bool valid = iter.Next(); - CHECK(valid) << "Iterative DMatrix must have at least 1 batch."; - - auto d = MakeProxy(proxy_)->DeviceIdx(); - if (batch_param_.gpu_id != Context::kCpuId) { - CHECK_EQ(d, batch_param_.gpu_id) << "All batch should be on the same device."; - } - batch_param_ = BatchParam{d, max_bin}; - batch_param_.sparse_thresh = 0.2; // default from TrainParam - - ctx_.UpdateAllowUnknown( - Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); - if (ctx_.IsCPU()) { - this->InitFromCPU(iter_handle, missing, ref); - } else { - this->InitFromCUDA(iter_handle, missing, ref); - } - } + bst_bin_t max_bin); ~IterativeDMatrix() override = default; bool EllpackExists() const override { return static_cast(ellpack_); } From 7654b8827f6ddf45be4437d7006a1a665d2188e6 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 8 Sep 2022 18:46:35 +0800 Subject: [PATCH 02/10] log. --- src/data/iterative_dmatrix.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 352c7cc2cce7..c8752785f43b 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -36,7 +36,8 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro int32_t max_device{d}; rabit::Allreduce(&max_device, 1); if (max_device != d) { - CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0); + CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0) + << "max device:" << max_device << " device:" << d; CHECK_NE(d, Context::kCpuId) << msg; d = max_device; } From cc90c2bc16db715e73b90874ece4b53527e41423 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 8 Sep 2022 18:58:12 +0800 Subject: [PATCH 03/10] Remove hack. --- src/data/iterative_dmatrix.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index c8752785f43b..60591a1d91b8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -33,15 +33,6 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro CHECK_EQ(d, batch_param_.gpu_id) << msg; } - int32_t max_device{d}; - rabit::Allreduce(&max_device, 1); - if (max_device != d) { - CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0) - << "max device:" << max_device << " device:" << d; - CHECK_NE(d, Context::kCpuId) << msg; - d = max_device; - } - batch_param_ = BatchParam{d, max_bin}; batch_param_.sparse_thresh = 0.2; // default from TrainParam From 23faf656ad47d89b0d230b06d8f7e8c61ae583c6 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Thu, 8 Sep 2022 10:26:22 -0700 Subject: [PATCH 04/10] [CI] Don't require manual approval for master branch (#8235) --- tests/buildkite/pipeline-win64.yml | 1 + tests/buildkite/pipeline.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/buildkite/pipeline-win64.yml b/tests/buildkite/pipeline-win64.yml index afb6aafba680..d4d171b02b02 100644 --- a/tests/buildkite/pipeline-win64.yml +++ b/tests/buildkite/pipeline-win64.yml @@ -1,5 +1,6 @@ steps: - block: ":rocket: Run this test job" + if: build.pull_request.repository.fork == true #### -------- BUILD -------- - label: ":windows: Build XGBoost for Windows with CUDA" command: "tests/buildkite/build-win64-gpu.ps1" diff --git a/tests/buildkite/pipeline.yml b/tests/buildkite/pipeline.yml index 7180e0f5020c..86763e75ca6d 100644 --- a/tests/buildkite/pipeline.yml +++ b/tests/buildkite/pipeline.yml @@ -3,6 +3,7 @@ env: DOCKER_CACHE_ECR_REGION: "us-west-2" steps: - block: ":rocket: Run this test job" + if: build.pull_request.repository.fork == true #### -------- BUILD -------- - label: ":console: Run clang-tidy" command: "tests/buildkite/run-clang-tidy.sh" From 70045739ad7cd63dfdf37e2d7f997730329145a2 Mon Sep 17 00:00:00 2001 From: jiamingy Date: Fri, 9 Sep 2022 04:32:29 +0800 Subject: [PATCH 05/10] Avoid duplicated parameter values. --- src/data/iterative_dmatrix.cc | 8 +++++++- src/tree/param.h | 8 ++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 60591a1d91b8..30583a9439bc 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -7,6 +7,7 @@ #include "../common/column_matrix.h" #include "../common/hist_util.h" +#include "../tree/param.h" // FIXME(jiamingy): Find a better way to share this parameter. #include "gradient_index.h" #include "proxy_dmatrix.h" #include "simple_batch_iterator.h" @@ -34,7 +35,8 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro } batch_param_ = BatchParam{d, max_bin}; - batch_param_.sparse_thresh = 0.2; // default from TrainParam + // hardcoded parameter. + batch_param_.sparse_thresh = tree::TrainParam::DftSparseThreshold(); ctx_.UpdateAllowUnknown( Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); @@ -242,6 +244,10 @@ BatchSet IterativeDMatrix::GetGradientIndex(BatchParam const& ghist_ = std::make_shared(&ctx_, Info(), *ellpack_, param); } + if (param.sparse_thresh != tree::TrainParam::DftSparseThreshold()) { + LOG(WARNING) << "`sparse_threshold` can not be changed when `QuantileDMatrix` is used instead " + "of `DMatrix`."; + } auto begin_iter = BatchIterator(new SimpleBatchIteratorImpl(ghist_)); return BatchSet(begin_iter); diff --git a/src/tree/param.h b/src/tree/param.h index 7930dde8975b..3f5e4ec7bc71 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -78,7 +78,9 @@ struct TrainParam : public XGBoostParameter { // ------ From CPU quantile histogram -------. // percentage threshold for treating a feature as sparse // e.g. 0.2 indicates a feature with fewer than 20% nonzeros is considered sparse - double sparse_threshold; + static constexpr double DftSparseThreshold() { return 0.2; } + + double sparse_threshold{DftSparseThreshold()}; // declare the parameters DMLC_DECLARE_PARAMETER(TrainParam) { @@ -182,7 +184,9 @@ struct TrainParam : public XGBoostParameter { "See tutorial for more information"); // ------ From cpu quantile histogram -------. - DMLC_DECLARE_FIELD(sparse_threshold).set_range(0, 1.0).set_default(0.2) + DMLC_DECLARE_FIELD(sparse_threshold) + .set_range(0, 1.0) + .set_default(DftSparseThreshold()) .describe("percentage threshold for treating a feature as sparse"); // add alias of parameters From dd44ac91b844c47f2b97a20a3e7f7a7a5273748a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 10 Sep 2022 10:51:15 +0800 Subject: [PATCH 06/10] [CI] Use binary R dependencies on Windows. (#8241) --- .github/workflows/r_tests.yml | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml index ebc22056443e..d49d5b936ddd 100644 --- a/.github/workflows/r_tests.yml +++ b/.github/workflows/r_tests.yml @@ -31,8 +31,8 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} - name: Install dependencies shell: Rscript {0} @@ -80,20 +80,25 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} - name: Install dependencies shell: Rscript {0} + if: matrix.config.os != 'windows-latest' run: | install.packages(${{ env.R_PACKAGES }}, repos = 'http://cloud.r-project.org', dependencies = c('Depends', 'Imports', 'LinkingTo')) - - name: Install igraph on Windows + + - name: Install binary dependencies shell: Rscript {0} if: matrix.config.os == 'windows-latest' run: | - install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo')) + install.packages(${{ env.R_PACKAGES }}, + type = 'binary', + repos = 'http://cloud.r-project.org', + dependencies = c('Depends', 'Imports', 'LinkingTo')) - uses: actions/setup-python@v2 with: @@ -132,8 +137,8 @@ jobs: uses: actions/cache@v2 with: path: ${{ env.R_LIBS_USER }} - key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} - restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }} + key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} + restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }} - name: Install dependencies shell: Rscript {0} From bc818316f25fc28d8ab8c1b42eeafc211fd94065 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 10 Sep 2022 15:16:49 +0800 Subject: [PATCH 07/10] Prepare for improving Windows networking compatibility. (#8234) * Prepare for improving Windows networking compatibility. * Include dmlc filesystem indirectly as dmlc/filesystem.h includes windows.h, which conflicts with winsock2.h * Define `NOMINMAX` conditionally. * Link the winsock library when mysys32 is used. * Add config file for read the doc. --- .gitignore | 7 +++-- .readthedocs.yaml | 28 +++++++++++++++++++ R-package/src/Makevars.win | 2 +- cmake/Utils.cmake | 6 +++- rabit/src/allreduce_base.cc | 3 ++ src/cli_main.cc | 4 +++ tests/cpp/common/test_column_matrix.cc | 1 - tests/cpp/common/test_config.cc | 6 ++-- tests/cpp/common/test_hist_util.cu | 21 +++++++------- tests/cpp/common/test_hist_util.h | 11 ++++---- tests/cpp/common/test_io.cc | 4 +-- tests/cpp/common/test_json.cc | 11 ++++---- tests/cpp/common/test_quantile.h | 5 ++++ tests/cpp/common/test_version.cc | 10 +++---- tests/cpp/data/test_data.cc | 5 ++-- .../cpp/data/test_ellpack_page_raw_format.cu | 5 ++-- tests/cpp/data/test_file_iterator.cc | 4 +-- tests/cpp/data/test_metainfo.cc | 7 +++-- tests/cpp/data/test_simple_dmatrix.cc | 7 +++-- tests/cpp/data/test_simple_dmatrix.cu | 1 - tests/cpp/data/test_sparse_page_dmatrix.cc | 8 ++++-- tests/cpp/data/test_sparse_page_dmatrix.cu | 4 +-- tests/cpp/data/test_sparse_page_raw_format.cc | 2 +- tests/cpp/filesystem.h | 15 ++++++++++ tests/cpp/gbm/test_gbtree.cc | 2 +- tests/cpp/helpers.cc | 2 +- tests/cpp/helpers.h | 23 ++++++++------- tests/cpp/plugin/test_predictor_oneapi.cc | 6 ++-- tests/cpp/predictor/test_cpu_predictor.cc | 2 +- tests/cpp/predictor/test_gpu_predictor.cu | 1 - tests/cpp/test_learner.cc | 15 +++++----- tests/cpp/test_serialization.cc | 12 ++++---- .../gpu_hist/test_gradient_based_sampler.cu | 2 +- tests/cpp/tree/test_gpu_hist.cu | 14 +++++----- tests/cpp/tree/test_tree_model.cc | 7 +++-- 35 files changed, 165 insertions(+), 98 deletions(-) create mode 100644 .readthedocs.yaml create mode 100644 tests/cpp/filesystem.h diff --git a/.gitignore b/.gitignore index 20b92c057e1a..15503ad57bf4 100644 --- a/.gitignore +++ b/.gitignore @@ -97,8 +97,11 @@ metastore_db R-package/src/Makevars *.lib -# Visual Studio Code -/.vscode/ +# Visual Studio +.vs/ +CMakeSettings.json +*.ilk +*.pdb # IntelliJ/CLion .idea diff --git a/.readthedocs.yaml b/.readthedocs.yaml new file mode 100644 index 000000000000..924f516f95e6 --- /dev/null +++ b/.readthedocs.yaml @@ -0,0 +1,28 @@ +# .readthedocs.yaml +# Read the Docs configuration file +# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details + +# Required +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + apt_packages: + - graphviz + +# Build documentation in the docs/ directory with Sphinx +sphinx: + configuration: doc/conf.py + +# If using Sphinx, optionally build your docs in additional formats such as PDF +formats: + - pdf + +# Optionally declare the Python requirements required to build your docs +python: + install: + - requirements: doc/requirements.txt + system_packages: true diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win index 381d5a6d3889..f48e41188f6d 100644 --- a/R-package/src/Makevars.win +++ b/R-package/src/Makevars.win @@ -30,7 +30,7 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v))) PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS) PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) +PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -lwsock32 -lws2_32 OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \ $(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \ $(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/rabit_c_api.o \ diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index 9c8dc188c735..f28c1f2703e6 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -244,7 +244,7 @@ macro(xgboost_target_properties target) $<$>:/utf-8> -D_CRT_SECURE_NO_WARNINGS -D_CRT_SECURE_NO_DEPRECATE - ) + ) endif (MSVC) if (WIN32 AND MINGW) @@ -314,4 +314,8 @@ macro(xgboost_target_link_libraries target) if (RABIT_BUILD_MPI) target_link_libraries(${target} PRIVATE MPI::MPI_CXX) endif (RABIT_BUILD_MPI) + + if (MINGW) + target_link_libraries(${target} PRIVATE wsock32 ws2_32) + endif (MINGW) endmacro(xgboost_target_link_libraries) diff --git a/rabit/src/allreduce_base.cc b/rabit/src/allreduce_base.cc index ffed988efb98..75ba901b2145 100644 --- a/rabit/src/allreduce_base.cc +++ b/rabit/src/allreduce_base.cc @@ -5,7 +5,10 @@ * * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou */ +#if !defined(NOMINMAX) && defined(_WIN32) #define NOMINMAX +#endif // !defined(NOMINMAX) + #include "rabit/base.h" #include "rabit/internal/rabit-inl.h" #include "allreduce_base.h" diff --git a/src/cli_main.cc b/src/cli_main.cc index dc99cf4f0451..9e3b9e6c820f 100644 --- a/src/cli_main.cc +++ b/src/cli_main.cc @@ -6,7 +6,11 @@ */ #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE + +#if !defined(NOMINMAX) && defined(_WIN32) #define NOMINMAX +#endif // !defined(NOMINMAX) + #include #include diff --git a/tests/cpp/common/test_column_matrix.cc b/tests/cpp/common/test_column_matrix.cc index e2f59c58d8fc..fca5c0c4e316 100644 --- a/tests/cpp/common/test_column_matrix.cc +++ b/tests/cpp/common/test_column_matrix.cc @@ -1,7 +1,6 @@ /*! * Copyright 2018-2022 by XGBoost Contributors */ -#include #include #include "../../../src/common/column_matrix.h" diff --git a/tests/cpp/common/test_config.cc b/tests/cpp/common/test_config.cc index 7bf61dcfd729..5807db8caa9a 100644 --- a/tests/cpp/common/test_config.cc +++ b/tests/cpp/common/test_config.cc @@ -1,11 +1,13 @@ /*! * Copyright 2019 by Contributors */ +#include + #include #include -#include -#include + #include "../../../src/common/config.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index e92e2728eb40..7324531b17f0 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -1,26 +1,25 @@ /*! * Copyright 2019-2022 by XGBoost Contributors */ -#include #include +#include +#include +#include #include #include -#include - -#include -#include -#include "test_hist_util.h" -#include "../helpers.h" -#include "../data/test_array_interface.h" +#include "../../../include/xgboost/logging.h" #include "../../../src/common/device_helpers.cuh" -#include "../../../src/common/hist_util.h" #include "../../../src/common/hist_util.cuh" -#include "../../../src/data/device_adapter.cuh" +#include "../../../src/common/hist_util.h" #include "../../../src/common/math.h" +#include "../../../src/data/device_adapter.cuh" #include "../../../src/data/simple_dmatrix.h" -#include "../../../include/xgboost/logging.h" +#include "../data/test_array_interface.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" +#include "test_hist_util.h" namespace xgboost { namespace common { diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index 37a38d1b3361..c8881c158621 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -3,16 +3,17 @@ */ #pragma once #include -#include + +#include #include -#include #include -#include +#include -#include "../helpers.h" #include "../../../src/common/hist_util.h" -#include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/adapter.h" +#include "../../../src/data/simple_dmatrix.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" #ifdef __CUDACC__ #include diff --git a/tests/cpp/common/test_io.cc b/tests/cpp/common/test_io.cc index 974fdefad6c8..feac8bd89934 100644 --- a/tests/cpp/common/test_io.cc +++ b/tests/cpp/common/test_io.cc @@ -2,12 +2,12 @@ * Copyright (c) by XGBoost Contributors 2019 */ #include -#include #include -#include "../helpers.h" #include "../../../src/common/io.h" +#include "../helpers.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory namespace xgboost { namespace common { diff --git a/tests/cpp/common/test_json.cc b/tests/cpp/common/test_json.cc index 7836b23afcae..6397122756c4 100644 --- a/tests/cpp/common/test_json.cc +++ b/tests/cpp/common/test_json.cc @@ -2,16 +2,17 @@ * Copyright (c) by Contributors 2019-2022 */ #include -#include + #include #include +#include "../../../src/common/charconv.h" +#include "../../../src/common/io.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" #include "xgboost/json.h" -#include "xgboost/logging.h" #include "xgboost/json_io.h" -#include "../helpers.h" -#include "../../../src/common/io.h" -#include "../../../src/common/charconv.h" +#include "xgboost/logging.h" namespace xgboost { diff --git a/tests/cpp/common/test_quantile.h b/tests/cpp/common/test_quantile.h index 8118248dc939..d92695f53276 100644 --- a/tests/cpp/common/test_quantile.h +++ b/tests/cpp/common/test_quantile.h @@ -1,3 +1,6 @@ +#ifndef XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ +#define XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ + #include #include #include @@ -62,3 +65,5 @@ template void RunWithSeedsAndBins(size_t rows, Fn fn) { } } // namespace common } // namespace xgboost + +#endif // XGBOOST_TESTS_CPP_COMMON_TEST_QUANTILE_H_ diff --git a/tests/cpp/common/test_version.cc b/tests/cpp/common/test_version.cc index e6ee030f7b8c..802e04344645 100644 --- a/tests/cpp/common/test_version.cc +++ b/tests/cpp/common/test_version.cc @@ -1,18 +1,16 @@ /*! * Copyright 2019 XGBoost contributors */ -#include - -#include #include - -#include -#include +#include #include +#include +#include #include #include "../../../src/common/version.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory namespace xgboost { TEST(Version, Basic) { diff --git a/tests/cpp/data/test_data.cc b/tests/cpp/data/test_data.cc index 92e94fee8dd1..871a7f49800e 100644 --- a/tests/cpp/data/test_data.cc +++ b/tests/cpp/data/test_data.cc @@ -2,13 +2,14 @@ * Copyright 2019-2022 by XGBoost Contributors */ #include -#include + #include #include #include -#include "xgboost/data.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" +#include "xgboost/data.h" namespace xgboost { TEST(SparsePage, PushCSC) { diff --git a/tests/cpp/data/test_ellpack_page_raw_format.cu b/tests/cpp/data/test_ellpack_page_raw_format.cu index d4b5722eabf6..92b4acf4bd96 100644 --- a/tests/cpp/data/test_ellpack_page_raw_format.cu +++ b/tests/cpp/data/test_ellpack_page_raw_format.cu @@ -2,12 +2,11 @@ * Copyright 2021 XGBoost contributors */ #include -#include #include -#include "../../../src/data/sparse_page_source.h" #include "../../../src/data/ellpack_page.cuh" - +#include "../../../src/data/sparse_page_source.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/data/test_file_iterator.cc b/tests/cpp/data/test_file_iterator.cc index 12ae9e726d4a..21029620b8ab 100644 --- a/tests/cpp/data/test_file_iterator.cc +++ b/tests/cpp/data/test_file_iterator.cc @@ -2,13 +2,13 @@ * Copyright 2021 XGBoost contributors */ #include -#include #include +#include "../../../src/data/adapter.h" #include "../../../src/data/file_iterator.h" #include "../../../src/data/proxy_dmatrix.h" -#include "../../../src/data/adapter.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 62146b571fb8..342af77bfba2 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -2,12 +2,13 @@ #include "test_metainfo.h" #include -#include #include -#include + #include -#include "../../../src/common/version.h" +#include +#include "../../../src/common/version.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "xgboost/base.h" diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index e709605c9c6d..266115731fd6 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -1,12 +1,13 @@ // Copyright by Contributors -#include #include #include -#include "xgboost/base.h" -#include "../../../src/data/simple_dmatrix.h" + #include "../../../src/data/adapter.h" +#include "../../../src/data/simple_dmatrix.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" +#include "xgboost/base.h" using namespace xgboost; // NOLINT diff --git a/tests/cpp/data/test_simple_dmatrix.cu b/tests/cpp/data/test_simple_dmatrix.cu index 19f13b1fddfd..4b020c0a6cda 100644 --- a/tests/cpp/data/test_simple_dmatrix.cu +++ b/tests/cpp/data/test_simple_dmatrix.cu @@ -1,5 +1,4 @@ // Copyright by Contributors -#include #include #include "../../../src/data/simple_dmatrix.h" diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index b5ed00fb4f8a..68171932bdc5 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -1,14 +1,16 @@ // Copyright by Contributors -#include #include #include -#include + #include +#include + #include "../../../src/common/io.h" #include "../../../src/data/adapter.h" +#include "../../../src/data/file_iterator.h" #include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/sparse_page_dmatrix.h" -#include "../../../src/data/file_iterator.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" using namespace xgboost; // NOLINT diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index b9e91e6b1ef3..07c86c93f99c 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -1,10 +1,10 @@ // Copyright by Contributors -#include -#include "../helpers.h" #include "../../../src/common/compressed_iterator.h" #include "../../../src/data/ellpack_page.cuh" #include "../../../src/data/sparse_page_dmatrix.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory +#include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/data/test_sparse_page_raw_format.cc b/tests/cpp/data/test_sparse_page_raw_format.cc index dc7c5b2be77f..5743c4223a1b 100644 --- a/tests/cpp/data/test_sparse_page_raw_format.cc +++ b/tests/cpp/data/test_sparse_page_raw_format.cc @@ -2,10 +2,10 @@ * Copyright 2021 XGBoost contributors */ #include -#include #include #include "../../../src/data/sparse_page_source.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" namespace xgboost { diff --git a/tests/cpp/filesystem.h b/tests/cpp/filesystem.h new file mode 100644 index 000000000000..5410feede316 --- /dev/null +++ b/tests/cpp/filesystem.h @@ -0,0 +1,15 @@ +/*! + * Copyright (c) 2022 by XGBoost Contributors + */ + +#ifndef XGBOOST_TESTS_CPP_FILESYSTEM_H +#define XGBOOST_TESTS_CPP_FILESYSTEM_H + +// A macro used inside `windows.h` to avoid conflicts with `winsock2.h` +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif // WIN32_LEAN_AND_MEAN + +#include "dmlc/filesystem.h" + +#endif // XGBOOST_TESTS_CPP_FILESYSTEM_H diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc index 00201769bc03..a5c16f7951d7 100644 --- a/tests/cpp/gbm/test_gbtree.cc +++ b/tests/cpp/gbm/test_gbtree.cc @@ -1,13 +1,13 @@ /*! * Copyright 2019-2022 XGBoost contributors */ -#include #include #include #include "../../../src/data/adapter.h" #include "../../../src/data/proxy_dmatrix.h" #include "../../../src/gbm/gbtree.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "xgboost/base.h" #include "xgboost/host_device_vector.h" diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index 17972dc8fc63..0273d964f707 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -3,7 +3,6 @@ */ #include "helpers.h" -#include #include #include #include @@ -21,6 +20,7 @@ #include "../../src/data/simple_dmatrix.h" #include "../../src/data/sparse_page_dmatrix.h" #include "../../src/gbm/gbtree_model.h" +#include "filesystem.h" // dmlc::TemporaryDirectory #include "xgboost/c_api.h" #include "xgboost/predictor.h" diff --git a/tests/cpp/helpers.h b/tests/cpp/helpers.h index d54e75f2f35c..b79ea27187f5 100644 --- a/tests/cpp/helpers.h +++ b/tests/cpp/helpers.h @@ -4,25 +4,24 @@ #ifndef XGBOOST_TESTS_CPP_HELPERS_H_ #define XGBOOST_TESTS_CPP_HELPERS_H_ -#include -#include -#include -#include -#include -#include +#include #include #include - -#include - -#include #include -#include #include +#include + +#include +#include +#include +#include +#include +#include #include "../../src/common/common.h" -#include "../../src/gbm/gbtree_model.h" #include "../../src/data/array_interface.h" +#include "../../src/gbm/gbtree_model.h" +#include "filesystem.h" // dmlc::TemporaryDirectory #if defined(__CUDACC__) #define DeclareUnifiedTest(name) GPU ## name diff --git a/tests/cpp/plugin/test_predictor_oneapi.cc b/tests/cpp/plugin/test_predictor_oneapi.cc index 61d82d846fe9..1e5df312c694 100755 --- a/tests/cpp/plugin/test_predictor_oneapi.cc +++ b/tests/cpp/plugin/test_predictor_oneapi.cc @@ -1,14 +1,14 @@ /*! * Copyright 2017-2020 XGBoost contributors */ -#include #include #include +#include "../../../src/data/adapter.h" +#include "../../../src/gbm/gbtree_model.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "../predictor/test_predictor.h" -#include "../../../src/gbm/gbtree_model.h" -#include "../../../src/data/adapter.h" namespace xgboost { TEST(Plugin, OneAPIPredictorBasic) { diff --git a/tests/cpp/predictor/test_cpu_predictor.cc b/tests/cpp/predictor/test_cpu_predictor.cc index ea5dce20c1e5..8ba270083c74 100644 --- a/tests/cpp/predictor/test_cpu_predictor.cc +++ b/tests/cpp/predictor/test_cpu_predictor.cc @@ -1,7 +1,6 @@ /*! * Copyright 2017-2022 XGBoost contributors */ -#include #include #include @@ -9,6 +8,7 @@ #include "../../../src/data/proxy_dmatrix.h" #include "../../../src/gbm/gbtree.h" #include "../../../src/gbm/gbtree_model.h" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "test_predictor.h" diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index fa849cf51bd9..8dacadac5403 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -1,7 +1,6 @@ /*! * Copyright 2017-2020 XGBoost contributors */ -#include #include #include #include diff --git a/tests/cpp/test_learner.cc b/tests/cpp/test_learner.cc index 987626df86bf..4a8214e9c5cd 100644 --- a/tests/cpp/test_learner.cc +++ b/tests/cpp/test_learner.cc @@ -2,17 +2,18 @@ * Copyright 2017-2022 by XGBoost contributors */ #include -#include -#include -#include "helpers.h" -#include - #include #include -#include "xgboost/json.h" + +#include +#include + #include "../../src/common/io.h" -#include "../../src/common/random.h" #include "../../src/common/linalg_op.h" +#include "../../src/common/random.h" +#include "filesystem.h" // dmlc::TemporaryDirectory +#include "helpers.h" +#include "xgboost/json.h" namespace xgboost { TEST(Learner, Basic) { diff --git a/tests/cpp/test_serialization.cc b/tests/cpp/test_serialization.cc index bf459cf35dcb..d80a7442202e 100644 --- a/tests/cpp/test_serialization.cc +++ b/tests/cpp/test_serialization.cc @@ -1,14 +1,16 @@ // Copyright (c) 2019-2022 by Contributors #include -#include -#include -#include -#include #include +#include #include -#include "helpers.h" +#include + +#include + #include "../../src/common/io.h" #include "../../src/common/random.h" +#include "filesystem.h" // dmlc::TemporaryDirectory +#include "helpers.h" namespace xgboost { template diff --git a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu index 9e8cd19bec74..e211fe70a483 100644 --- a/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu +++ b/tests/cpp/tree/gpu_hist/test_gradient_based_sampler.cu @@ -6,8 +6,8 @@ #include "../../../../src/data/ellpack_page.cuh" #include "../../../../src/tree/gpu_hist/gradient_based_sampler.cuh" #include "../../../../src/tree/param.h" +#include "../../filesystem.h" // dmlc::TemporaryDirectory #include "../../helpers.h" -#include "dmlc/filesystem.h" namespace xgboost { namespace tree { diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index f28f723301af..24fe40f42f58 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -4,22 +4,22 @@ #include #include #include -#include #include + #include #include #include +#include "../../../src/common/common.h" +#include "../../../src/data/sparse_page_source.h" +#include "../../../src/tree/constraints.cuh" +#include "../../../src/tree/updater_gpu_common.cuh" +#include "../../../src/tree/updater_gpu_hist.cu" +#include "../filesystem.h" // dmlc::TemporaryDirectory #include "../helpers.h" #include "../histogram_helpers.h" - #include "xgboost/generic_parameters.h" #include "xgboost/json.h" -#include "../../../src/data/sparse_page_source.h" -#include "../../../src/tree/updater_gpu_hist.cu" -#include "../../../src/tree/updater_gpu_common.cuh" -#include "../../../src/common/common.h" -#include "../../../src/tree/constraints.cuh" namespace xgboost { namespace tree { diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index fb14e300c0f4..65957255bf38 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -1,11 +1,12 @@ // Copyright by Contributors #include + +#include "../../../src/common/bitfield.h" +#include "../../../src/common/categorical.h" +#include "../filesystem.h" #include "../helpers.h" -#include "dmlc/filesystem.h" #include "xgboost/json_io.h" #include "xgboost/tree_model.h" -#include "../../../src/common/bitfield.h" -#include "../../../src/common/categorical.h" namespace xgboost { TEST(Tree, ModelShape) { From 571da234b492e4544810235767510f07c05f5c50 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 1 Sep 2022 00:19:50 +0800 Subject: [PATCH 08/10] Copy ellpack to ghist. Start working on sparse data. Fix race. Remove check. Merge functions. Cleanup. Cleanup. Start writing tests. Fix. comp column. Python test. lint. lint. Fix. Cleanup. Avoid binary search. Use quantile dmatrix by default in sklearn interface. dask as well. Fix max_bin. Fix empty dmatrix for CPU. Fix GPU version. Fix empty DMatrix. pylint. --- python-package/xgboost/dask.py | 25 +++++++++++++------ python-package/xgboost/sklearn.py | 23 ++++++++++++++---- src/data/iterative_dmatrix.cc | 40 +++++++++++++++++++++++++++++++ src/data/iterative_dmatrix.cu | 11 +++++++-- src/data/iterative_dmatrix.h | 25 +------------------ 5 files changed, 87 insertions(+), 37 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index 75eeba875fee..9a74d0143681 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -726,10 +726,9 @@ def _create_quantile_dmatrix( if parts is None: msg = f"worker {worker.address} has an empty DMatrix." LOGGER.warning(msg) - import cupy d = QuantileDMatrix( - cupy.zeros((0, 0)), + numpy.empty((0, 0)), feature_names=feature_names, feature_types=feature_types, max_bin=max_bin, @@ -1544,15 +1543,21 @@ def inplace_predict( # pylint: disable=unused-argument async def _async_wrap_evaluation_matrices( - client: Optional["distributed.Client"], **kwargs: Any + client: Optional["distributed.Client"], + tree_method: Optional[str], + max_bin: Optional[int], + **kwargs: Any, ) -> Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]]: """A switch function for async environment.""" - def _inner(**kwargs: Any) -> DaskDMatrix: - m = DaskDMatrix(client=client, **kwargs) - return m + def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix: + if tree_method in ("hist", "gpu_hist"): + return DaskQuantileDMatrix( + client=client, ref=ref, max_bin=max_bin, **kwargs + ) + return DaskDMatrix(client=client, **kwargs) - train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_inner, **kwargs) + train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_dispatch, **kwargs) train_dmatrix = await train_dmatrix if evals is None: return train_dmatrix, evals @@ -1756,6 +1761,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( client=self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, @@ -1851,6 +1858,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, @@ -2057,6 +2066,8 @@ async def _fit_async( params = self.get_xgb_params() dtrain, evals = await _async_wrap_evaluation_matrices( self.client, + tree_method=self.tree_method, + max_bin=self.max_bin, X=X, y=y, group=None, diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 98ad43af64ad..d820ccc01f87 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -38,6 +38,7 @@ Booster, DMatrix, Metric, + QuantileDMatrix, XGBoostError, _convert_ntree_limit, _deprecate_positional_args, @@ -430,7 +431,8 @@ def _wrap_evaluation_matrices( enable_categorical: bool, feature_types: Optional[FeatureTypes], ) -> Tuple[Any, List[Tuple[Any, str]]]: - """Convert array_like evaluation matrices into DMatrix. Perform validation on the way.""" + """Convert array_like evaluation matrices into DMatrix. Perform validation on the + way.""" train_dmatrix = create_dmatrix( data=X, label=y, @@ -442,6 +444,7 @@ def _wrap_evaluation_matrices( missing=missing, enable_categorical=enable_categorical, feature_types=feature_types, + ref=None, ) n_validation = 0 if eval_set is None else len(eval_set) @@ -491,6 +494,7 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence: missing=missing, enable_categorical=enable_categorical, feature_types=feature_types, + ref=train_dmatrix, ) evals.append(m) nevals = len(evals) @@ -904,6 +908,17 @@ def _duplicated(parameter: str) -> None: return model, metric, params, early_stopping_rounds, callbacks + def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix: + # Use `QuantileDMatrix` to save memory. + if self.tree_method in ("hist", "gpu_hist"): + try: + return QuantileDMatrix( + **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin + ) + except TypeError: # `QuantileDMatrix` supports lesser types than DMatrix + pass + return DMatrix(**kwargs, nthread=self.n_jobs) + def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None: if evals_result: self.evals_result_ = cast(Dict[str, Dict[str, List[float]]], evals_result) @@ -996,7 +1011,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) @@ -1479,7 +1494,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=None, eval_qid=None, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) @@ -1930,7 +1945,7 @@ def fit( base_margin_eval_set=base_margin_eval_set, eval_group=eval_group, eval_qid=eval_qid, - create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs), + create_dmatrix=self._create_dmatrix, enable_categorical=self.enable_categorical, feature_types=self.feature_types, ) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index f108c746ba09..352c7cc2cce7 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -14,6 +14,45 @@ namespace xgboost { namespace data { +IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy, + std::shared_ptr ref, DataIterResetCallback* reset, + XGDMatrixCallbackNext* next, float missing, int nthread, + bst_bin_t max_bin) + : proxy_{proxy}, reset_{reset}, next_{next} { + // fetch the first batch + auto iter = + DataIterProxy{iter_handle, reset_, next_}; + iter.Reset(); + bool valid = iter.Next(); + CHECK(valid) << "Iterative DMatrix must have at least 1 batch."; + + auto d = MakeProxy(proxy_)->DeviceIdx(); + + StringView msg{"All batch should be on the same device."}; + if (batch_param_.gpu_id != Context::kCpuId) { + CHECK_EQ(d, batch_param_.gpu_id) << msg; + } + + int32_t max_device{d}; + rabit::Allreduce(&max_device, 1); + if (max_device != d) { + CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0); + CHECK_NE(d, Context::kCpuId) << msg; + d = max_device; + } + + batch_param_ = BatchParam{d, max_bin}; + batch_param_.sparse_thresh = 0.2; // default from TrainParam + + ctx_.UpdateAllowUnknown( + Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); + if (ctx_.IsCPU()) { + this->InitFromCPU(iter_handle, missing, ref); + } else { + this->InitFromCUDA(iter_handle, missing, ref); + } +} + void GetCutsFromRef(std::shared_ptr ref_, bst_feature_t n_features, BatchParam p, common::HistogramCuts* p_cuts) { CHECK(ref_); @@ -199,6 +238,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing, if (n_batches == 1) { this->info_ = std::move(proxy->Info()); this->info_.num_nonzero_ = nnz; + this->info_.num_col_ = n_features; // proxy might be empty. CHECK_EQ(proxy->Info().labels.Size(), 0); } } diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 901662852a15..ceb470a5c7e5 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -173,8 +173,15 @@ BatchSet IterativeDMatrix::GetEllpackBatches(BatchParam const& para } if (!ellpack_ && ghist_) { ellpack_.reset(new EllpackPage()); - this->ctx_.gpu_id = param.gpu_id; - this->Info().feature_types.SetDevice(param.gpu_id); + // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU + // ID. + if (this->ctx_.IsCPU()) { + this->ctx_.gpu_id = param.gpu_id; + } + if (this->ctx_.IsCPU()) { + this->ctx_.gpu_id = dh::CurrentDevice(); + } + this->Info().feature_types.SetDevice(this->ctx_.gpu_id); *ellpack_->Impl() = EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan()); } diff --git a/src/data/iterative_dmatrix.h b/src/data/iterative_dmatrix.h index 7a8e5188c921..30358bb819ca 100644 --- a/src/data/iterative_dmatrix.h +++ b/src/data/iterative_dmatrix.h @@ -75,30 +75,7 @@ class IterativeDMatrix : public DMatrix { explicit IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy, std::shared_ptr ref, DataIterResetCallback *reset, XGDMatrixCallbackNext *next, float missing, int nthread, - bst_bin_t max_bin) - : proxy_{proxy}, reset_{reset}, next_{next} { - // fetch the first batch - auto iter = - DataIterProxy{iter_handle, reset_, next_}; - iter.Reset(); - bool valid = iter.Next(); - CHECK(valid) << "Iterative DMatrix must have at least 1 batch."; - - auto d = MakeProxy(proxy_)->DeviceIdx(); - if (batch_param_.gpu_id != Context::kCpuId) { - CHECK_EQ(d, batch_param_.gpu_id) << "All batch should be on the same device."; - } - batch_param_ = BatchParam{d, max_bin}; - batch_param_.sparse_thresh = 0.2; // default from TrainParam - - ctx_.UpdateAllowUnknown( - Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}}); - if (ctx_.IsCPU()) { - this->InitFromCPU(iter_handle, missing, ref); - } else { - this->InitFromCUDA(iter_handle, missing, ref); - } - } + bst_bin_t max_bin); ~IterativeDMatrix() override = default; bool EllpackExists() const override { return static_cast(ellpack_); } From 9d6e47da9bd8a5453867b0aaabbed605ff333047 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 8 Sep 2022 18:46:35 +0800 Subject: [PATCH 09/10] log. --- src/data/iterative_dmatrix.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index 352c7cc2cce7..c8752785f43b 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -36,7 +36,8 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro int32_t max_device{d}; rabit::Allreduce(&max_device, 1); if (max_device != d) { - CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0); + CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0) + << "max device:" << max_device << " device:" << d; CHECK_NE(d, Context::kCpuId) << msg; d = max_device; } From 4e388eb9445b94c7a119fd0d668865931a4c6462 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 8 Sep 2022 18:58:12 +0800 Subject: [PATCH 10/10] Remove hack. --- src/data/iterative_dmatrix.cc | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index c8752785f43b..60591a1d91b8 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -33,15 +33,6 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro CHECK_EQ(d, batch_param_.gpu_id) << msg; } - int32_t max_device{d}; - rabit::Allreduce(&max_device, 1); - if (max_device != d) { - CHECK_EQ(MakeProxy(proxy_)->Info().num_row_, 0) - << "max device:" << max_device << " device:" << d; - CHECK_NE(d, Context::kCpuId) << msg; - d = max_device; - } - batch_param_ = BatchParam{d, max_bin}; batch_param_.sparse_thresh = 0.2; // default from TrainParam