Make QuantileDMatrix default to sklearn estimators. #8220
@@ -38,6 +38,7 @@
     Booster,
     DMatrix,
     Metric,
+    QuantileDMatrix,
     XGBoostError,
     _convert_ntree_limit,
     _deprecate_positional_args,
@@ -430,7 +431,8 @@ def _wrap_evaluation_matrices(
     enable_categorical: bool,
     feature_types: Optional[FeatureTypes],
 ) -> Tuple[Any, List[Tuple[Any, str]]]:
-    """Convert array_like evaluation matrices into DMatrix. Perform validation on the way."""
+    """Convert array_like evaluation matrices into DMatrix. Perform validation on the
+    way."""
     train_dmatrix = create_dmatrix(
         data=X,
         label=y,
@@ -442,6 +444,7 @@ def _wrap_evaluation_matrices(
         missing=missing,
         enable_categorical=enable_categorical,
         feature_types=feature_types,
+        ref=None,
     )

     n_validation = 0 if eval_set is None else len(eval_set)
@@ -491,6 +494,7 @@ def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence:
                 missing=missing,
                 enable_categorical=enable_categorical,
                 feature_types=feature_types,
+                ref=train_dmatrix,
             )
             evals.append(m)
         nevals = len(evals)
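The `ref=train_dmatrix` argument is what ties each evaluation matrix to the quantile cuts computed from the training data. A minimal hedged sketch of the same mechanism using `QuantileDMatrix` directly; the array names and sizes below are placeholders, not part of this PR:

```python
import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X_train, y_train = rng.normal(size=(1024, 8)), rng.normal(size=1024)
X_valid, y_valid = rng.normal(size=(256, 8)), rng.normal(size=256)

Xy_train = xgb.QuantileDMatrix(X_train, label=y_train, max_bin=256)
# The validation matrix reuses the histogram cuts of the training matrix via
# `ref`, mirroring `ref=train_dmatrix` in the diff above.
Xy_valid = xgb.QuantileDMatrix(X_valid, label=y_valid, ref=Xy_train)

booster = xgb.train({"tree_method": "hist"}, Xy_train, evals=[(Xy_valid, "valid")])
```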
@@ -904,6 +908,17 @@ def _duplicated(parameter: str) -> None:

         return model, metric, params, early_stopping_rounds, callbacks

+    def _create_dmatrix(self, ref: Optional[DMatrix], **kwargs: Any) -> DMatrix:
+        # Use `QuantileDMatrix` to save memory.
+        if self.tree_method in ("hist", "gpu_hist"):
+            try:
+                return QuantileDMatrix(
+                    **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
+                )
+            except TypeError:  # `QuantileDMatrix` supports lesser types than DMatrix
Review comment: Which cases are not supported, out of curiosity? Just wondering if this is going to run into the exception regularly in some cases.

Author reply: datatable, CSC, arrow. Also, the DMatrix has a dispatcher to convert unknown types to SciPy CSR.
+                pass
+        return DMatrix(**kwargs, nthread=self.n_jobs)
+
     def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None:
         if evals_result:
             self.evals_result_ = cast(Dict[str, Dict[str, List[float]]], evals_result)
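The review exchange above describes when the `TypeError` fallback fires. A self-contained hedged sketch of the same pattern outside the estimator class; the helper name `make_dmatrix` is an assumption, and which input types actually trigger the fallback depends on the xgboost version (per the discussion: datatable, CSC, and arrow at the time of this PR):

```python
from typing import Any, Optional

import xgboost as xgb


def make_dmatrix(ref: Optional[xgb.DMatrix], use_hist: bool, **kwargs: Any) -> xgb.DMatrix:
    if use_hist:
        try:
            # QuantileDMatrix pre-computes histogram cuts, avoiding an extra copy
            # of the data inside the hist/gpu_hist tree methods.
            return xgb.QuantileDMatrix(**kwargs, ref=ref)
        except TypeError:
            pass  # input type not understood by QuantileDMatrix; fall back
    return xgb.DMatrix(**kwargs)
```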
@@ -996,7 +1011,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=None,
             eval_qid=None,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
@@ -1479,7 +1494,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=None,
             eval_qid=None,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
@@ -1930,7 +1945,7 @@ def fit(
             base_margin_eval_set=base_margin_eval_set,
             eval_group=eval_group,
             eval_qid=eval_qid,
-            create_dmatrix=lambda **kwargs: DMatrix(nthread=self.n_jobs, **kwargs),
+            create_dmatrix=self._create_dmatrix,
             enable_categorical=self.enable_categorical,
             feature_types=self.feature_types,
         )
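Taken together, these fit() hunks route the scikit-learn estimators through `_create_dmatrix`. A hedged end-user sketch of what changes in practice; the dataset and parameters are illustrative, and the `fit()` call itself is unchanged, only the internal matrix construction differs:

```python
from sklearn.datasets import make_regression
import xgboost as xgb

X, y = make_regression(n_samples=10_000, n_features=20, random_state=0)

# With tree_method="hist" (or "gpu_hist"), the estimator now builds a
# QuantileDMatrix internally instead of a DMatrix, reducing peak memory;
# other tree methods keep using DMatrix.
reg = xgb.XGBRegressor(tree_method="hist", max_bin=256, n_estimators=10)
reg.fit(X, y, eval_set=[(X, y)], verbose=False)
```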
@@ -14,6 +14,37 @@
 namespace xgboost {
 namespace data {

+IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle proxy,
+                                   std::shared_ptr<DMatrix> ref, DataIterResetCallback* reset,
+                                   XGDMatrixCallbackNext* next, float missing, int nthread,
+                                   bst_bin_t max_bin)
+    : proxy_{proxy}, reset_{reset}, next_{next} {
+  // fetch the first batch
+  auto iter =
+      DataIterProxy<DataIterResetCallback, XGDMatrixCallbackNext>{iter_handle, reset_, next_};
+  iter.Reset();
+  bool valid = iter.Next();
+  CHECK(valid) << "Iterative DMatrix must have at least 1 batch.";
+
+  auto d = MakeProxy(proxy_)->DeviceIdx();
+
+  StringView msg{"All batch should be on the same device."};
+  if (batch_param_.gpu_id != Context::kCpuId) {
+    CHECK_EQ(d, batch_param_.gpu_id) << msg;
+  }
+
+  batch_param_ = BatchParam{d, max_bin};
+  batch_param_.sparse_thresh = 0.2;  // default from TrainParam
Review comment: If we change the default, this hardcoded value will be forgotten.

Author reply: Let me fix it.

Author reply: Changed to obtain the value from train param.
+
+  ctx_.UpdateAllowUnknown(
+      Args{{"nthread", std::to_string(nthread)}, {"gpu_id", std::to_string(d)}});
+  if (ctx_.IsCPU()) {
+    this->InitFromCPU(iter_handle, missing, ref);
+  } else {
+    this->InitFromCUDA(iter_handle, missing, ref);
+  }
+}
+
 void GetCutsFromRef(std::shared_ptr<DMatrix> ref_, bst_feature_t n_features, BatchParam p,
                     common::HistogramCuts* p_cuts) {
   CHECK(ref_);

@@ -199,6 +230,7 @@ void IterativeDMatrix::InitFromCPU(DataIterHandle iter_handle, float missing,
   if (n_batches == 1) {
     this->info_ = std::move(proxy->Info());
     this->info_.num_nonzero_ = nnz;
+    this->info_.num_col_ = n_features;  // proxy might be empty.
     CHECK_EQ(proxy->Info().labels.Size(), 0);
   }
 }
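The constructor above drives a caller-supplied iterator through its reset/next callbacks and scans the batches to build the quantized matrix. A hedged Python-side sketch of the same protocol using the public DataIter interface that QuantileDMatrix accepts; the `BatchIter` class name, batch count, and shapes are illustrative only:

```python
import numpy as np
import xgboost as xgb


class BatchIter(xgb.DataIter):
    def __init__(self) -> None:
        rng = np.random.default_rng(0)
        self._batches = [
            (rng.normal(size=(512, 8)), rng.normal(size=512)) for _ in range(4)
        ]
        self._it = 0
        super().__init__()

    def next(self, input_data) -> int:
        # Called repeatedly by the backend; return 0 once all batches are consumed.
        if self._it == len(self._batches):
            return 0
        X, y = self._batches[self._it]
        input_data(data=X, label=y)
        self._it += 1
        return 1

    def reset(self) -> None:
        self._it = 0


# The quantized matrix is built by iterating over the batches, analogous to the
# IterativeDMatrix constructor shown in the C++ diff.
Xy = xgb.QuantileDMatrix(BatchIter(), max_bin=256)
```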
Review comment: It appears that `tree_method` and `max_bin` will no longer be in the `kwargs` dictionary. Can you make sure this will not cause undesirable behavior? For example, is `kwargs` passed to `xgb.train`?

Author reply: Training parameters are obtained via `params = self.get_xgb_params()`, which you can find in the `async def _fit_async` function.
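A small hedged illustration of that reply: booster parameters such as `tree_method` and `max_bin` reach `xgb.train()` through the estimator's `get_xgb_params()` dictionary rather than through the DMatrix keyword arguments (the exact set of keys returned may vary by version):

```python
import xgboost as xgb

reg = xgb.XGBRegressor(tree_method="hist", max_bin=128)
params = reg.get_xgb_params()
# Expected (subject to version differences): "hist" and 128.
print(params.get("tree_method"), params.get("max_bin"))
```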