Make QuantileDMatrix the default for sklearn estimators. #8220

Merged
merged 11 commits into from Sep 13, 2022
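For orientation, a minimal sketch of the user-visible effect of this change (synthetic data; the parameter values are illustrative only, but `XGBRegressor`, `tree_method`, and `max_bin` are the names touched by the diffs below):

import numpy as np
from xgboost import XGBRegressor

X = np.random.rand(1000, 16)
y = np.random.rand(1000)

# With tree_method="hist" (or "gpu_hist"), fit() now builds a QuantileDMatrix
# internally instead of a DMatrix, storing pre-binned data to save memory.
reg = XGBRegressor(tree_method="hist", max_bin=256)
reg.fit(X, y)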
21 changes: 13 additions & 8 deletions .github/workflows/r_tests.yml
@@ -31,8 +31,8 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
       - name: Install dependencies
         shell: Rscript {0}
@@ -80,20 +80,25 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
       - name: Install dependencies
         shell: Rscript {0}
+        if: matrix.config.os != 'windows-latest'
         run: |
           install.packages(${{ env.R_PACKAGES }},
                            repos = 'http://cloud.r-project.org',
                            dependencies = c('Depends', 'Imports', 'LinkingTo'))
-      - name: Install igraph on Windows
+
+      - name: Install binary dependencies
         shell: Rscript {0}
         if: matrix.config.os == 'windows-latest'
         run: |
-          install.packages('igraph', type='binary', dependencies = c('Depends', 'Imports', 'LinkingTo'))
+          install.packages(${{ env.R_PACKAGES }},
+                           type = 'binary',
+                           repos = 'http://cloud.r-project.org',
+                           dependencies = c('Depends', 'Imports', 'LinkingTo'))
 
       - uses: actions/setup-python@v2
         with:
@@ -132,8 +137,8 @@ jobs:
         uses: actions/cache@v2
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
-          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-4-${{ hashFiles('R-package/DESCRIPTION') }}
+          key: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
+          restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-5-${{ hashFiles('R-package/DESCRIPTION') }}
 
       - name: Install dependencies
         shell: Rscript {0}
7 changes: 5 additions & 2 deletions .gitignore
@@ -97,8 +97,11 @@ metastore_db
 R-package/src/Makevars
 *.lib
 
-# Visual Studio Code
-/.vscode/
+# Visual Studio
+.vs/
+CMakeSettings.json
+*.ilk
+*.pdb
 
 # IntelliJ/CLion
 .idea
28 changes: 28 additions & 0 deletions .readthedocs.yaml
@@ -0,0 +1,28 @@
# .readthedocs.yaml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details

# Required
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"
  apt_packages:
    - graphviz

# Build documentation in the docs/ directory with Sphinx
sphinx:
  configuration: doc/conf.py

# If using Sphinx, optionally build your docs in additional formats such as PDF
formats:
  - pdf

# Optionally declare the Python requirements required to build your docs
python:
  install:
    - requirements: doc/requirements.txt
  system_packages: true
2 changes: 1 addition & 1 deletion R-package/src/Makevars.win
@@ -30,7 +30,7 @@ $(foreach v, $(XGB_RFLAGS), $(warning $(v)))
 
 PKG_CPPFLAGS= -I$(PKGROOT)/include -I$(PKGROOT)/dmlc-core/include -I$(PKGROOT)/rabit/include -I$(PKGROOT) $(XGB_RFLAGS)
 PKG_CXXFLAGS= $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS)
+PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) $(SHLIB_PTHREAD_FLAGS) -lwsock32 -lws2_32
 OBJECTS= ./xgboost_R.o ./xgboost_custom.o ./xgboost_assert.o ./init.o \
 	$(PKGROOT)/amalgamation/xgboost-all0.o $(PKGROOT)/amalgamation/dmlc-minimum0.o \
 	$(PKGROOT)/rabit/src/engine.o $(PKGROOT)/rabit/src/rabit_c_api.o \
6 changes: 5 additions & 1 deletion cmake/Utils.cmake
@@ -244,7 +244,7 @@ macro(xgboost_target_properties target)
       $<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/utf-8>
       -D_CRT_SECURE_NO_WARNINGS
       -D_CRT_SECURE_NO_DEPRECATE
-    )
+      )
   endif (MSVC)
 
   if (WIN32 AND MINGW)
@@ -314,4 +314,8 @@ macro(xgboost_target_link_libraries target)
   if (RABIT_BUILD_MPI)
     target_link_libraries(${target} PRIVATE MPI::MPI_CXX)
   endif (RABIT_BUILD_MPI)
+
+  if (MINGW)
+    target_link_libraries(${target} PRIVATE wsock32 ws2_32)
+  endif (MINGW)
 endmacro(xgboost_target_link_libraries)
25 changes: 18 additions & 7 deletions python-package/xgboost/dask.py
@@ -726,10 +726,9 @@ def _create_quantile_dmatrix(
     if parts is None:
         msg = f"worker {worker.address} has an empty DMatrix."
        LOGGER.warning(msg)
-        import cupy
 
         d = QuantileDMatrix(
-            cupy.zeros((0, 0)),
+            numpy.empty((0, 0)),
             feature_names=feature_names,
             feature_types=feature_types,
             max_bin=max_bin,
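As an aside, the hunk above relies on an empty QuantileDMatrix being constructible from numpy alone, which is why the cupy import could be dropped; a minimal check, assuming an xgboost build that contains this PR:

import numpy as np
import xgboost as xgb

# A 0x0 matrix mirrors what the empty worker above constructs.
m = xgb.QuantileDMatrix(np.empty((0, 0)))
print(m.num_row(), m.num_col())  # 0 0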
@@ -1544,15 +1543,21 @@ def inplace_predict(  # pylint: disable=unused-argument
 
 
 async def _async_wrap_evaluation_matrices(
-    client: Optional["distributed.Client"], **kwargs: Any
+    client: Optional["distributed.Client"],
+    tree_method: Optional[str],
+    max_bin: Optional[int],
+    **kwargs: Any,
Review comment (Collaborator):
It appears that `tree_method` and `max_bin` will no longer be in the `kwargs` dictionary. Can you make sure this will not cause undesirable behavior? For example, is `kwargs` passed to `xgb.train`?

Reply (Member, Author):
Training parameters are obtained via `params = self.get_xgb_params()`, which you can find in the `async def _fit_async` function.
 ) -> Tuple[DaskDMatrix, Optional[List[Tuple[DaskDMatrix, str]]]]:
     """A switch function for async environment."""
 
-    def _inner(**kwargs: Any) -> DaskDMatrix:
-        m = DaskDMatrix(client=client, **kwargs)
-        return m
+    def _dispatch(ref: Optional[DaskDMatrix], **kwargs: Any) -> DaskDMatrix:
+        if tree_method in ("hist", "gpu_hist"):
+            return DaskQuantileDMatrix(
+                client=client, ref=ref, max_bin=max_bin, **kwargs
+            )
+        return DaskDMatrix(client=client, **kwargs)
 
-    train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_inner, **kwargs)
+    train_dmatrix, evals = _wrap_evaluation_matrices(create_dmatrix=_dispatch, **kwargs)
     train_dmatrix = await train_dmatrix
     if evals is None:
         return train_dmatrix, evals
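The reply in the thread above refers to the sklearn wrappers' parameter handling; a short sketch of that pattern (public API, illustrative values):

import xgboost as xgb

reg = xgb.XGBRegressor(tree_method="hist", max_bin=64)
# Booster parameters travel through get_xgb_params(), separately from the
# data kwargs consumed by _wrap_evaluation_matrices.
params = reg.get_xgb_params()
assert params["tree_method"] == "hist"
assert params["max_bin"] == 64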
@@ -1756,6 +1761,8 @@ async def _fit_async(
         params = self.get_xgb_params()
         dtrain, evals = await _async_wrap_evaluation_matrices(
             client=self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
             X=X,
             y=y,
             group=None,
@@ -1851,6 +1858,8 @@ async def _fit_async(
         params = self.get_xgb_params()
         dtrain, evals = await _async_wrap_evaluation_matrices(
             self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
             X=X,
             y=y,
             group=None,
@@ -2057,6 +2066,8 @@ async def _fit_async(
         params = self.get_xgb_params()
         dtrain, evals = await _async_wrap_evaluation_matrices(
             self.client,
+            tree_method=self.tree_method,
+            max_bin=self.max_bin,
             X=X,
             y=y,
             group=None,
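A hedged end-to-end sketch of the dask path above (assumes a local CPU cluster; the "gpu_hist" branch would additionally need cupy- or dask_cudf-backed inputs):

from dask import array as da
from distributed import Client, LocalCluster

import xgboost as xgb

with LocalCluster(n_workers=2) as cluster, Client(cluster) as client:
    X = da.random.random((10000, 8), chunks=(1000, 8))
    y = da.random.random(10000, chunks=(1000,))
    reg = xgb.dask.DaskXGBRegressor(tree_method="hist", max_bin=128)
    reg.client = client
    # fit() now routes matrix construction through _dispatch, producing a
    # DaskQuantileDMatrix because tree_method is "hist".
    reg.fit(X, y)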
23 changes: 19 additions & 4 deletions python-package/xgboost/sklearn.py
@@ -38,6 +38,7 @@
     Booster,
     DMatrix,
     Metric,
+    QuantileDMatrix,
     XGBoostError,
     _convert_ntree_limit,
     _deprecate_positional_args,
@@ -430,7 +431,8 @@ def _wrap_evaluation_matrices(
     enable_categorical: bool,
     feature_types: Optional[FeatureTypes],
 ) -> Tuple[Any, List[Tuple[Any, str]]]:
-    """Convert array_like evaluation matrices into DMatrix. Perform validation on the way."""
+    """Convert array_like evaluation matrices into DMatrix. Perform validation on the
+    way."""
     train_dmatrix = create_dmatrix(
         data=X,
         label=y,
@@ -442,6 +444,7 @@
         missing=missing,
         enable_categorical=enable_categorical,
         feature_types=feature_types,
+        ref=None,
     )
 
     n_validation = 0 if eval_set is None else len(eval_set)
@@ -491,6 +494,7 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence:
                 missing=missing,
                 enable_categorical=enable_categorical,
                 feature_types=feature_types,
+                ref=train_dmatrix,
             )
             evals.append(m)
         nevals = len(evals)
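The `ref=train_dmatrix` argument matters because evaluation matrices must reuse the quantile cuts computed from the training data rather than deriving their own. The same contract is visible through the public API; a standalone sketch with synthetic data:

import numpy as np
import xgboost as xgb

X_train, y_train = np.random.rand(256, 8), np.random.rand(256)
X_valid, y_valid = np.random.rand(64, 8), np.random.rand(64)

Xy_train = xgb.QuantileDMatrix(X_train, y_train, max_bin=256)
# Without ref=, the validation matrix would compute different bin boundaries.
Xy_valid = xgb.QuantileDMatrix(X_valid, y_valid, max_bin=256, ref=Xy_train)
booster = xgb.train(
    {"tree_method": "hist"}, Xy_train, num_boost_round=10, evals=[(Xy_valid, "valid")]
)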
@@ -904,6 +908,17 @@ def _duplicated(parameter: str) -> None:
 
         return model, metric, params, early_stopping_rounds, callbacks
 
+    def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
+        # Use `QuantileDMatrix` to save memory.
+        if self.tree_method in ("hist", "gpu_hist"):
+            try:
+                return QuantileDMatrix(
+                    **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
+                )
+            except TypeError:  # `QuantileDMatrix` supports fewer types than DMatrix
Review comment (Member):
Which cases are not supported, out of curiosity? Just wondering if this is going to run into the exception regularly in some cases.

Reply (Member, Author):
datatable, CSC, and Arrow. Also, the DMatrix has a dispatcher to convert unknown types to scipy CSR.

+                pass
+        return DMatrix(**kwargs, nthread=self.n_jobs)
 
     def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None:
         if evals_result:
             self.evals_result_ = cast(Dict[str, Dict[str, List[float]]], evals_result)
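To illustrate the `TypeError` fallback discussed above (a sketch, assuming scipy is available and that CSC remains one of the formats `QuantileDMatrix` rejects, per the author's reply):

import numpy as np
import scipy.sparse as sp

import xgboost as xgb

X = sp.random(200, 10, density=0.3, format="csc")
y = np.random.rand(200)

# QuantileDMatrix raises TypeError for CSC input, so _create_dmatrix silently
# falls back to a plain DMatrix and training still succeeds.
reg = xgb.XGBRegressor(tree_method="hist")
reg.fit(X, y)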
@@ -996,7 +1011,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=None,
             eval_qid=None,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
@@ -1479,7 +1494,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=None,
             eval_qid=None,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
@@ -1930,7 +1945,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=eval_group,
             eval_qid=eval_qid,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
3 changes: 3 additions & 0 deletions rabit/src/allreduce_base.cc
@@ -5,7 +5,10 @@
  *
  * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou
  */
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
 
 #include "rabit/base.h"
 #include "rabit/internal/rabit-inl.h"
 #include "allreduce_base.h"
4 changes: 4 additions & 0 deletions src/cli_main.cc
@@ -6,7 +6,11 @@
  */
 #define _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_DEPRECATE
 
+#if !defined(NOMINMAX) && defined(_WIN32)
+#define NOMINMAX
+#endif  // !defined(NOMINMAX)
+
 #include <dmlc/timer.h>
 
 #include <xgboost/learner.h>
38 changes: 38 additions & 0 deletions src/data/iterative_dmatrix.cc
@@ -7,13 +7,46 @@
 
 #include "../common/column_matrix.h"
 #include "../common/hist_util.h"
+#include "../tree/param.h"  // FIXME(jiamingy): Find a better way to share this parameter.
 #include "gradient_index.h"
 #include "proxy_dmatrix.h"
 #include "simple_batch_iterator.h"
 
 namespace xgboost {
 namespace data {
 
+IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
+                                   std::shared_ptr<DMatrix> ref, DataIterResetCallback* reset,
+                                   XGDMatrixCallbackNext* next, float missing, int nthread,
+                                   bst_bin_t max_bin)
+    : proxy_{proxy}, reset_{reset}, next_{next} {
+  // fetch the first batch
+  auto iter =
+      DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_handle, reset_, next_};
+  iter.Reset();
+  bool valid = iter.Next();
+  CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
+
+  auto d = MakeProxy(proxy_)->DeviceIdx();
+
+  StringView msg{"All batch should be on the same device."};
+  if (batch_param_.gpu_id != Context::kCpuId) {
+    CHECK_EQ(d, batch_param_.gpu_id) << msg;
+  }
+
+  batch_param_ = BatchParam{d, max_bin};
+  // hardcoded parameter.
+  batch_param_.sparse_thresh = tree::TrainParam::DftSparseThreshold();
+
+  ctx_.UpdateAllowUnknown(
+      Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
+  if (ctx_.IsCPU()) {
+    this->InitFromCPU(iter_handle, missing, ref);
+  } else {
+    this->InitFromCUDA(iter_handle, missing, ref);
+  }
+}
+
 void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
                     common::HistogramCuts* p_cuts) {
   CHECK(ref_);
@@ -199,6 +232,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   if (n_batches == 1) {
     this->info_ = std::move(proxy->Info());
     this->info_.num_nonzero_ = nnz;
+    this->info_.num_col_ = n_features;  // proxy might be empty.
     CHECK_EQ(proxy->Info().labels.Size(), 0);
   }
 }
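The new constructor consumes the C-level data-iterator protocol (reset/next callbacks plus a proxy DMatrix). From Python, the same protocol is exercised through the public DataIter class; a sketch with synthetic batches, assuming xgboost >= 1.7:

import numpy as np
import xgboost as xgb

class BatchIter(xgb.DataIter):
    """Feed pre-made (X, y) batches to the DMatrix construction loop."""

    def __init__(self, batches):
        self._batches = batches
        self._it = 0
        super().__init__()

    def next(self, input_data):
        if self._it == len(self._batches):
            return 0  # no more batches; the constructor CHECKs the first call returned 1
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self):
        self._it = 0

batches = [(np.random.rand(64, 4), np.random.rand(64)) for _ in range(3)]
Xy = xgb.QuantileDMatrix(BatchIter(batches), max_bin=128)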
@@ -210,6 +244,10 @@ BatchSet<GHistIndexMatrix> IterativeDMatrix::GetGradientIndex(BatchParam const&
     ghist_ = std::make_shared<GHistIndexMatrix>(&ctx_, Info(), *ellpack_, param);
   }
 
+  if (param.sparse_thresh != tree::TrainParam::DftSparseThreshold()) {
+    LOG(WARNING) << "`sparse_threshold` can not be changed when `QuantileDMatrix` is used instead "
+                    "of `DMatrix`.";
+  }
   auto begin_iter =
       BatchIterator<GHistIndexMatrix>(new SimpleBatchIteratorImpl<GHistIndexMatrix>(ghist_));
   return BatchSet<GHistIndexMatrix>(begin_iter);
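A sketch of when that warning fires (hedged: this assumes `sparse_threshold` is still accepted as a hist-method training parameter; with a QuantileDMatrix the histogram bins were already built with the default threshold, so a custom value cannot take effect):

import numpy as np
import xgboost as xgb

X, y = np.random.rand(128, 4), np.random.rand(128)
Xy = xgb.QuantileDMatrix(X, y)

# Emits the warning above: the pre-built bins ignore the custom value.
xgb.train({"tree_method": "hist", "sparse_threshold": 0.5}, Xy, num_boost_round=2)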
11 changes: 9 additions & 2 deletions src/data/iterative_dmatrix.cu
@@ -173,8 +173,15 @@ BatchSet<EllpackPage> IterativeDMatrix::GetEllpackBatches(BatchParam const& para
   }
   if (!ellpack_ && ghist_) {
     ellpack_.reset(new EllpackPage());
-    this->ctx_.gpu_id = param.gpu_id;
-    this->Info().feature_types.SetDevice(param.gpu_id);
+    // Evaluation QuantileDMatrix initialized from CPU data might not have the correct GPU
+    // ID.
+    if (this->ctx_.IsCPU()) {
+      this->ctx_.gpu_id = param.gpu_id;
+    }
+    if (this->ctx_.IsCPU()) {
+      this->ctx_.gpu_id = dh::CurrentDevice();
+    }
+    this->Info().feature_types.SetDevice(this->ctx_.gpu_id);
     *ellpack_->Impl() =
         EllpackPageImpl(&ctx_, *this->ghist_, this->Info().feature_types.ConstDeviceSpan());
   }
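Note on the two consecutive `IsCPU()` checks above: they are sequential fallbacks rather than a duplicate. The first adopts `param.gpu_id`; if that value is itself `kCpuId`, the context still reports CPU and the second check falls back to `dh::CurrentDevice()`.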