From 1c9b7586e6d61e41209f2f85ce1297665a8a7275 Mon Sep 17 00:00:00 2001
From: fis
Date: Wed, 23 Feb 2022 12:23:08 +0800
Subject: [PATCH 1/3] Support categorical data for hist.

---
 doc/parameter.rst                           |   7 +-
 doc/tutorials/categorical.rst               |  83 +++----
 include/xgboost/tree_model.h                |  10 +
 python-package/xgboost/core.py              |   7 +-
 python-package/xgboost/sklearn.py           |  17 +-
 src/common/partition_builder.h              |  82 ++++---
 src/common/threading_utils.h                |   3 -
 src/tree/hist/evaluate_splits.h             |  14 +-
 src/tree/updater_quantile_hist.cc           | 209 +++--------------
 src/tree/updater_quantile_hist.h            | 163 ++++++++++---
 tests/cpp/tree/hist/test_evaluate_splits.cc |  18 +-
 tests/cpp/tree/test_approx.cc               |  16 +-
 tests/cpp/tree/test_partitioner.h           |  21 ++
 tests/cpp/tree/test_quantile_hist.cc        | 240 +++++---------------
 tests/python/test_updaters.py               |   1 +
 15 files changed, 393 insertions(+), 498 deletions(-)
 create mode 100644 tests/cpp/tree/test_partitioner.h

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 227263e7da61..992bd9f5af8b 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -244,9 +244,6 @@ Additional parameters for ``hist``, ``gpu_hist`` and ``approx`` tree method

   - Use single precision to build histograms instead of double precision.

-Additional parameters for ``approx`` and ``gpu_hist`` tree method
-=================================================================
-
 * ``max_cat_to_onehot``

   .. versionadded:: 1.6

   - A threshold for deciding whether XGBoost should use one-hot encoding based split for
     categorical data. When number of categories is lesser than the threshold then one-hot
     encoding is chosen, otherwise the categories will be partitioned into children nodes.
-  Only relevant for regression and binary classification. Also, `approx` or `gpu_hist`
-  tree method is required.
+  Only relevant for regression and binary classification. Also, the ``exact`` tree method
+  is not supported.

 Additional parameters for Dart Booster (``booster=dart``)
 =========================================================

diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index 65081be57030..7a185a113116 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -4,16 +4,16 @@ Categorical Data

 .. note::

-  As of XGBoost 1.6, the feature is highly experimental and has limited features
+  As of XGBoost 1.6, the support is experimental and has limited features.

 Starting from version 1.5, XGBoost has experimental support for categorical data available
-for public testing. At the moment, the support is implemented as one-hot encoding based
-categorical tree splits. For numerical data, the split condition is defined as
-:math:`value < threshold`, while for categorical data the split is defined as :math:`value
-== category` and ``category`` is a discrete value. More advanced categorical split
-strategy is planned for future releases and this tutorial details how to inform XGBoost
-about the data type. Also, the current support for training is limited to ``gpu_hist``
-tree method.
+for public testing. For numerical data, the split condition is defined as :math:`value <
+threshold`, while for categorical data the split is defined depending on whether
+partitioning or one-hot encoding is used. For partition-based splits, the splits are
+specified as :math:`value \in categories`, where ``categories`` is the set of categories
+in one feature. If one-hot encoding is used instead, then the split is defined as
+:math:`value == category`. More advanced categorical split strategies are planned for
+future releases, and this tutorial details how to inform XGBoost about the data type.

 ************************************
 Training with scikit-learn Interface
 ************************************

@@ -35,13 +35,13 @@ parameter ``enable_categorical``:

 .. code:: python

-  # Only gpu_hist is supported for categorical data as mentioned previously
+  # Supported tree methods are `gpu_hist`, `approx`, and `hist`.
   clf = xgb.XGBClassifier(
       tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
   )
   # X is the dataframe we created in previous snippet
   clf.fit(X, y)
-  # Must use JSON for serialization, otherwise the information is lost
+  # Must use JSON/UBJSON for serialization, otherwise the information is lost.
   clf.save_model("categorical-model.json")

@@ -60,11 +60,37 @@ can plot the model and calculate the global feature importance:

 The ``scikit-learn`` interface from dask is similar to single node version. The basic
-idea is create dataframe with category feature type, and tell XGBoost to use ``gpu_hist``
-with parameter ``enable_categorical``. See :ref:`sphx_glr_python_examples_categorical.py`
-for a worked example of using categorical data with ``scikit-learn`` interface. A
-comparison between using one-hot encoded data and XGBoost's categorical data support can
-be found :ref:`sphx_glr_python_examples_cat_in_the_dat.py`.
+idea is to create a dataframe with the ``category`` feature type, and tell XGBoost to use
+it by setting the ``enable_categorical`` parameter. See
+:ref:`sphx_glr_python_examples_categorical.py` for a worked example of using categorical
+data with the ``scikit-learn`` interface. A comparison between using one-hot encoded data
+and XGBoost's categorical data support can be found in
+:ref:`sphx_glr_python_examples_cat_in_the_dat.py`.
+
+
+********************
+Optimal Partitioning
+********************
+
+.. versionadded:: 1.6
+
+Optimal partitioning is a technique for partitioning the categorical predictors for each
+node split; the proof of optimality for numerical objectives like ``RMSE`` was first
+introduced by `[1] <#references>`__. The algorithm has been used in decision trees for
+handling regression and binary classification tasks `[2] <#references>`__; later, LightGBM
+`[3] <#references>`__ brought it to the context of gradient boosting trees, and it is now
+also adopted in XGBoost as an optional feature for handling categorical splits. More
+specifically, the proof by Fisher `[1] <#references>`__ states that, when trying to
+partition a set of discrete values into groups based on the distances between a measure of
+these values, one only needs to look at sorted partitions instead of enumerating all
+possible permutations. In the context of decision trees, the discrete values are
+categories, and the measure is the output leaf value. Intuitively, we want to group the
+categories that output similar leaf values. During split finding, we first sort the
+gradient histogram to prepare the contiguous partitions, then enumerate the splits
+according to these sorted values. One of the related parameters for XGBoost is
+``max_cat_to_onehot``, which controls whether one-hot encoding or partitioning should be
+used for each feature; see :doc:`/parameter` for details. When the objective is not
+regression or binary classification, XGBoost will fall back to using one-hot encoding
+instead.
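+
+As a minimal sketch of how these options fit together (assuming ``X`` is a dataframe with
+``category`` dtype columns and ``y`` is the target; the threshold of ``16`` is an
+arbitrary value used only for illustration):
+
+.. code:: python
+
+  import xgboost as xgb
+
+  # `X` must use the pandas/cudf ``category`` dtype for its categorical columns.
+  Xy = xgb.DMatrix(X, y, enable_categorical=True)
+  # Features with fewer than 16 categories are split with one-hot encoding;
+  # the remaining features use optimal partitioning.
+  booster = xgb.train({"tree_method": "hist", "max_cat_to_onehot": 16}, Xy)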
********************** @@ -82,7 +108,7 @@ categorical data, we need to pass the similar parameter to :class:`DMatrix # X is a dataframe we created in previous snippet Xy = xgb.DMatrix(X, y, enable_categorical=True) - booster = xgb.train({"tree_method": "gpu_hist"}, Xy) + booster = xgb.train({"tree_method": "hist", "max_cat_to_onehot": 5}, Xy) # Must use JSON for serialization, otherwise the information is lost booster.save_model("categorical-model.json") @@ -109,30 +135,7 @@ types by using the ``feature_types`` parameter in :class:`DMatrix ` can also be used as categorical data. - -******************** -Optimal Partitioning -******************** - -.. versionadded:: 1.6 - -Optimal partitioning is a technique for partitioning the categorical predictors for each -node split, the proof of optimality for numerical objectives like ``RMSE`` was first -introduced by `[1] <#references>`__. The algorithm is used in decision trees for handling -regression and binary classification tasks `[2] <#references>`__, later LightGBM `[3] -<#references>`__ brought it to the context of gradient boosting trees and now is also -adopted in XGBoost as an optional feature for handling categorical splits. More -specifically, the proof by Fisher `[1] <#references>`__ states that, when trying to -partition a set of discrete values into groups based on the distances between a measure of -these values, one only needs to look at sorted partitions instead of enumerating all -possible permutations. In the context of decision trees, the discrete values are -categories, and the measure is the output leaf value. Intuitively, we want to group the -categories that output similar leaf values. During split finding, we first sort the -gradient histogram to prepare the contiguous partitions then enumerate the splits -according to these sorted values. One of the related parameters for XGBoost is -``max_cat_to_one_hot``, which controls whether one-hot encoding or partitioning should be -used for each feature, see :doc:`/parameter` for details. +:class:`dask.Array ` can also be used for categorical data. ************* Miscellaneous diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h index 5c23f278a83b..b2d2ad3383de 100644 --- a/include/xgboost/tree_model.h +++ b/include/xgboost/tree_model.h @@ -604,6 +604,16 @@ class RegTree : public Model { */ std::vector const &GetSplitTypes() const { return split_types_; } common::Span GetSplitCategories() const { return split_categories_; } + /*! + * \brief Get the bit storage for categories + */ + common::Span NodeCats(bst_node_t nidx) const { + auto node_ptr = GetCategoriesMatrix().node_ptr; + auto categories = GetCategoriesMatrix().categories; + auto segment = node_ptr[nidx]; + auto node_cats = categories.subspan(segment.beg, segment.size); + return node_cats; + } auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; } // The fields of split_categories_segments_[i] are set such that diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 009dea9904a0..f22371ab867a 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -582,10 +582,11 @@ def __init__( .. versionadded:: 1.3.0 + .. note:: This parameter is experimental + Experimental support of specializing for categorical features. Do not set - to True unless you are interested in development. Currently it's only - available for `gpu_hist` and `approx` tree methods. Also, JSON/UBJSON - serialization format is required. 
(XGBoost 1.6 for approx) + to True unless you are interested in development. Also, JSON/UBJSON + serialization format is required. """ if group is not None and qid is not None: diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index b7bfe8b3219e..6efbf7cd36f7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -206,10 +206,11 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]: .. versionadded:: 1.5.0 - Experimental support for categorical data. Do not set to true unless you are - interested in development. Only valid when `gpu_hist` or `approx` is used along - with dataframe as input. Also, JSON/UBJSON serialization format is - required. (XGBoost 1.6 for approx) + .. note:: This parameter is experimental + + Experimental support for categorical data. When enabled, cudf/pandas.DataFrame + should be used to specify categorical data type. Also, JSON/UBJSON + serialization format is required. max_cat_to_onehot : Optional[int] @@ -220,9 +221,8 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]: A threshold for deciding whether XGBoost should use one-hot encoding based split for categorical data. When number of categories is lesser than the threshold then one-hot encoding is chosen, otherwise the categories will be partitioned - into children nodes. Only relevant for regression and binary - classification. Also, ``approx`` or ``gpu_hist`` tree method is required. See - :doc:`Categorical Data ` for details. + into children nodes. Only relevant for regression and binary classification. + See :doc:`Categorical Data ` for details. eval_metric : Optional[Union[str, List[str], Callable]] @@ -846,7 +846,8 @@ def _duplicated(parameter: str) -> None: callbacks = self.callbacks if self.callbacks is not None else callbacks tree_method = params.get("tree_method", None) - if self.enable_categorical and tree_method not in ("gpu_hist", "approx"): + cat_support = {"gpu_hist", "approx", "hist"} + if self.enable_categorical and tree_method not in cat_support: raise ValueError( "Experimental support for categorical data is not implemented for" " current tree method yet." diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 5235ea3b9404..0fdde231f731 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -1,5 +1,5 @@ /*! 
- * Copyright 2021 by Contributors + * Copyright 2021-2022 by Contributors * \file row_set.h * \brief Quick Utility to compute subset of rows * \author Philip Cho, Tianqi Chen @@ -8,12 +8,15 @@ #define XGBOOST_COMMON_PARTITION_BUILDER_H_ #include + #include -#include -#include #include +#include +#include + +#include "categorical.h" +#include "column_matrix.h" #include "xgboost/tree_model.h" -#include "../common/column_matrix.h" namespace xgboost { namespace common { @@ -46,18 +49,20 @@ class PartitionBuilder { // on comparison of indexes values (idx_span) and split point (split_cond) // Handle dense columns // Analog of std::stable_partition, but in no-inplace manner - template + template inline std::pair PartitionKernel(const ColumnType& column, - common::Span rid_span, const int32_t split_cond, - common::Span left_part, common::Span right_part) { + common::Span row_indices, + common::Span left_part, + common::Span right_part, + size_t base_rowid, Predicate&& pred) { size_t* p_left_part = left_part.data(); size_t* p_right_part = right_part.data(); size_t nleft_elems = 0; size_t nright_elems = 0; - auto state = column.GetInitialState(rid_span.front()); + auto state = column.GetInitialState(row_indices.front() - base_rowid); - for (auto rid : rid_span) { - const int32_t bin_id = column.GetBinIdx(rid, &state); + for (auto rid : row_indices) { + const int32_t bin_id = column.GetBinIdx(rid - base_rowid, &state); if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { p_left_part[nleft_elems++] = rid; @@ -65,7 +70,7 @@ class PartitionBuilder { p_right_part[nright_elems++] = rid; } } else { - if (bin_id <= split_cond) { + if (pred(rid, bin_id)) { p_left_part[nleft_elems++] = rid; } else { p_right_part[nright_elems++] = rid; @@ -97,39 +102,64 @@ class PartitionBuilder { template void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range, - const int32_t split_cond, - const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { + const int32_t split_cond, GHistIndexMatrix const& gmat, + const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, - range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, - range.begin(), range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); const bst_uint fid = tree[nid].SplitIndex(); const bool default_left = tree[nid].DefaultLeft(); const auto column_ptr = column_matrix.GetColumn(fid); - std::pair child_nodes_sizes; + bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; + auto node_cats = tree.NodeCats(nid); + + auto const& index = gmat.index; + auto const& cut_values = gmat.cut.Values(); + auto cut_ptrs = gmat.cut.Ptrs(); + + auto pred = [&](auto ridx, auto bin_id) { + bool go_left; + if (is_cat) { + auto begin = gmat.RowIdx(ridx); + auto end = gmat.RowIdx(ridx + 1); + auto f_begin = cut_ptrs[fid]; + auto f_end = cut_ptrs[fid + 1]; + // bypassing the column matrix as we need the cut value instead of bin idx for categorical + // features. 
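+        // (BinarySearchBin searches this row's span of `index` for a bin id that falls in
+        // the feature's range [f_begin, f_end) and returns it, so the matched category can
+        // be looked up through `cut_values`; -1 means the row has no value for this
+        // feature, i.e. it is missing.)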
+ auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end); + if (gidx == -1) { + go_left = default_left; + } else { + go_left = Decision(node_cats, cut_values[gidx], default_left); + } + } else { + go_left = bin_id <= split_cond; + } + return go_left; + }; + std::pair child_nodes_sizes; if (column_ptr->GetType() == xgboost::common::kDenseColumn) { const common::DenseColumn& column = static_cast& >(*(column_ptr.get())); if (default_left) { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } else { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } } else { CHECK_EQ(any_missing, true); const common::SparseColumn& column = static_cast& >(*(column_ptr.get())); if (default_left) { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } else { - child_nodes_sizes = PartitionKernel(column, rid_span, - split_cond, left, right); + child_nodes_sizes = PartitionKernel(column, rid_span, left, right, + gmat.base_rowid, pred); } } diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index 4691fce7cd95..75e9ba5b3b18 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -275,9 +275,6 @@ class MemStackAllocator { T& operator[](size_t i) { return ptr_[i]; } T const& operator[](size_t i) const { return ptr_[i]; } - // FIXME(jiamingy): Remove this once we merge partitioner cleanup for hist. - auto Get() { return ptr_; } - private: T* ptr_ = nullptr; size_t required_size_; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index ed3f14e605da..169d1cdc541a 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -288,10 +288,10 @@ template class HistEvaluator { auto base_weight = evaluator.CalcWeight(candidate.nid, param_, GradStats{parent_sum}); - auto left_weight = evaluator.CalcWeight( - candidate.nid, param_, GradStats{candidate.split.left_sum}); - auto right_weight = evaluator.CalcWeight( - candidate.nid, param_, GradStats{candidate.split.right_sum}); + auto left_weight = + evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.left_sum}); + auto right_weight = + evaluator.CalcWeight(candidate.nid, param_, GradStats{candidate.split.right_sum}); if (candidate.split.is_cat) { std::vector split_cats; @@ -308,11 +308,11 @@ template class HistEvaluator { split_cats = candidate.split.cat_bits; common::CatBitField cat_bits{split_cats}; } - tree.ExpandCategorical( candidate.nid, candidate.split.SplitIndex(), split_cats, candidate.split.DefaultLeft(), - base_weight, left_weight, right_weight, candidate.split.loss_chg, parent_sum.GetHess(), - candidate.split.left_sum.GetHess(), candidate.split.right_sum.GetHess()); + base_weight, left_weight * param_.learning_rate, right_weight * param_.learning_rate, + candidate.split.loss_chg, parent_sum.GetHess(), candidate.split.left_sum.GetHess(), + candidate.split.right_sum.GetHess()); } else { tree.ExpandNode(candidate.nid, candidate.split.SplitIndex(), candidate.split.split_value, candidate.split.DefaultLeft(), base_weight, diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 616d1c5718ce..60b501b7c4a4 100644 --- 
a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -124,11 +124,12 @@ void QuantileHistMaker::Builder::InitRoot( nodes_for_subtraction_trick_.clear(); nodes_for_explicit_hist_build_.push_back(node); + auto const& row_set_collection = partitioner_.front().Partitions(); size_t page_id = 0; for (auto const& gidx : p_fmat->GetBatches(HistBatch(param_))) { this->histogram_builder_->BuildHist( - page_id, gidx, p_tree, row_set_collection_, + page_id, gidx, p_tree, row_set_collection, nodes_for_explicit_hist_build_, nodes_for_subtraction_trick_, gpair_h); ++page_id; } @@ -149,7 +150,7 @@ void QuantileHistMaker::Builder::InitRoot( grad_stat.Add(et.GetGrad(), et.GetHess()); } } else { - const common::RowSetCollection::Elem e = row_set_collection_[nid]; + const common::RowSetCollection::Elem e = row_set_collection[nid]; for (const size_t *it = e.begin; it < e.end; ++it) { grad_stat.Add(gpair_h[*it].GetGrad(), gpair_h[*it].GetHess()); } @@ -204,6 +205,7 @@ void QuantileHistMaker::Builder::SplitSiblings( const std::vector &nodes_for_apply_split, std::vector *nodes_to_evaluate, RegTree *p_tree) { builder_monitor_.Start("SplitSiblings"); + auto const& row_set_collection = this->partitioner_.front().Partitions(); for (auto const& entry : nodes_for_apply_split) { int nid = entry.nid; @@ -213,7 +215,7 @@ void QuantileHistMaker::Builder::SplitSiblings( const CPUExpandEntry right_node = CPUExpandEntry(cright, p_tree->GetDepth(cright), 0.0); nodes_to_evaluate->push_back(left_node); nodes_to_evaluate->push_back(right_node); - if (row_set_collection_[cleft].Size() < row_set_collection_[cright].Size()) { + if (row_set_collection[cleft].Size() < row_set_collection[cright].Size()) { nodes_for_explicit_hist_build_.push_back(left_node); nodes_for_subtraction_trick_.push_back(right_node); } else { @@ -253,16 +255,18 @@ void QuantileHistMaker::Builder::ExpandTree( AddSplitsToTree(expand, p_tree, &num_leaves, &nodes_for_apply_split); if (nodes_for_apply_split.size() != 0) { - ApplySplit(nodes_for_apply_split, gmat, column_matrix, p_tree); + HistRowPartitioner &partitioner = this->partitioner_.front(); + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); if (param_.max_depth == 0 || depth < param_.max_depth) { size_t i = 0; - for (auto const& gidx : p_fmat->GetBatches(HistBatch(param_))) { - this->histogram_builder_->BuildHist( - i, gidx, p_tree, row_set_collection_, - nodes_for_explicit_hist_build_, nodes_for_subtraction_trick_, - gpair_h); + for (auto const &gidx : p_fmat->GetBatches(HistBatch(param_))) { + this->histogram_builder_->BuildHist(i, gidx, p_tree, partitioner_.front().Partitions(), + nodes_for_explicit_hist_build_, + nodes_for_subtraction_trick_, gpair_h); ++i; } } else { @@ -293,7 +297,7 @@ void QuantileHistMaker::Builder::ExpandTree( template void QuantileHistMaker::Builder::Update( const GHistIndexMatrix &gmat, - const ColumnMatrix &column_matrix, + const common::ColumnMatrix &column_matrix, HostDeviceVector *gpair, DMatrix *p_fmat, RegTree *p_tree) { builder_monitor_.Start("Update"); @@ -333,14 +337,14 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( CHECK_GT(out_preds.Size(), 0U); - size_t n_nodes = row_set_collection_.end() - row_set_collection_.begin(); - - common::BlockedSpace2d space(n_nodes, [&](size_t node) { - return row_set_collection_[node].Size(); - }, 1024); + CHECK_EQ(partitioner_.size(), 1); + auto const &row_set_collection = 
this->partitioner_.front().Partitions(); + size_t n_nodes = row_set_collection.end() - row_set_collection.begin(); + common::BlockedSpace2d space( + n_nodes, [&](size_t node) { return partitioner_.front()[node].Size(); }, 1024); CHECK_EQ(out_preds.DeviceIdx(), GenericParameter::kCpuId); common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node, common::Range1d r) { - const RowSetCollection::Elem rowset = row_set_collection_[node]; + const common::RowSetCollection::Elem rowset = row_set_collection[node]; if (rowset.begin != nullptr && rowset.end != nullptr) { int nid = rowset.node_id; bst_float leaf_value; @@ -354,7 +358,7 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( } leaf_value = (*p_last_tree_)[nid].LeafValue(); - for (const size_t* it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { out_preds(*it) += leaf_value; } } @@ -364,10 +368,9 @@ bool QuantileHistMaker::Builder::UpdatePredictionCache( return true; } -template +template void QuantileHistMaker::Builder::InitSampling(const DMatrix& fmat, - std::vector* gpair, - std::vector* row_indices) { + std::vector* gpair) { const auto& info = fmat.Info(); auto& rnd = common::GlobalRandom(); std::vector& gpair_ref = *gpair; @@ -410,101 +413,31 @@ template void QuantileHistMaker::Builder::InitData( const GHistIndexMatrix &gmat, const DMatrix &fmat, const RegTree &tree, std::vector *gpair) { - CHECK((param_.max_depth > 0 || param_.max_leaves > 0)) - << "max_depth or max_leaves cannot be both 0 (unlimited); " - << "at least one should be a positive quantity."; - if (param_.grow_policy == TrainParam::kDepthWise) { - CHECK(param_.max_depth > 0) << "max_depth cannot be 0 (unlimited) " - << "when grow_policy is depthwise."; - } builder_monitor_.Start("InitData"); const auto& info = fmat.Info(); { - // initialize the row set - row_set_collection_.Clear(); // initialize histogram collection uint32_t nbins = gmat.cut.Ptrs().back(); // initialize histogram builder dmlc::OMPException exc; - exc.Rethrow(); this->histogram_builder_->Reset(nbins, BatchParam{GenericParameter::kCpuId, param_.max_bin}, this->ctx_->Threads(), 1, rabit::IsDistributed()); - std::vector& row_indices = *row_set_collection_.Data(); - row_indices.resize(info.num_row_); - size_t* p_row_indices = row_indices.data(); - // mark subsample and build list of member rows - if (param_.subsample < 1.0f) { CHECK_EQ(param_.sampling_method, TrainParam::kUniform) << "Only uniform sampling is supported, " << "gradient-based sampling is only support by GPU Hist."; builder_monitor_.Start("InitSampling"); - InitSampling(fmat, gpair, &row_indices); + InitSampling(fmat, gpair); builder_monitor_.Stop("InitSampling"); - CHECK_EQ(row_indices.size(), info.num_row_); // We should check that the partitioning was done correctly // and each row of the dataset fell into exactly one of the categories } - auto n_threads = this->ctx_->Threads(); - common::MemStackAllocator buff(n_threads); - bool* p_buff = buff.Get(); - std::fill(p_buff, p_buff + this->ctx_->Threads(), false); - - const size_t block_size = info.num_row_ / n_threads + !!(info.num_row_ % n_threads); - -#pragma omp parallel num_threads(n_threads) - { - exc.Run([&]() { - const size_t tid = omp_get_thread_num(); - const size_t ibegin = tid * block_size; - const size_t iend = std::min(static_cast(ibegin + block_size), - static_cast(info.num_row_)); - - for (size_t i = ibegin; i < iend; ++i) { - if ((*gpair)[i].GetHess() < 0.0f) { 
- p_buff[tid] = true; - break; - } - } - }); - } - exc.Rethrow(); - - bool has_neg_hess = false; - for (int32_t tid = 0; tid < n_threads; ++tid) { - if (p_buff[tid]) { - has_neg_hess = true; - } - } - - if (has_neg_hess) { - size_t j = 0; - for (size_t i = 0; i < info.num_row_; ++i) { - if ((*gpair)[i].GetHess() >= 0.0f) { - p_row_indices[j++] = i; - } - } - row_indices.resize(j); - } else { - #pragma omp parallel num_threads(n_threads) - { - exc.Run([&]() { - const size_t tid = omp_get_thread_num(); - const size_t ibegin = tid * block_size; - const size_t iend = std::min(static_cast(ibegin + block_size), - static_cast(info.num_row_)); - for (size_t i = ibegin; i < iend; ++i) { - p_row_indices[i] = i; - } - }); - } - exc.Rethrow(); - } } - row_set_collection_.Init(); + partitioner_.clear(); + partitioner_.emplace_back(info.num_row_, 0, this->ctx_->Threads()); { /* determine layout of data */ @@ -558,12 +491,9 @@ void QuantileHistMaker::Builder::InitData( builder_monitor_.Stop("InitData"); } -template -void QuantileHistMaker::Builder::FindSplitConditions( - const std::vector& nodes, - const RegTree& tree, - const GHistIndexMatrix& gmat, - std::vector* split_conditions) { +void HistRowPartitioner::FindSplitConditions(const std::vector &nodes, + const RegTree &tree, const GHistIndexMatrix &gmat, + std::vector *split_conditions) { const size_t n_nodes = nodes.size(); split_conditions->resize(n_nodes); @@ -576,8 +506,7 @@ void QuantileHistMaker::Builder::FindSplitConditions( int32_t split_cond = -1; // convert floating-point split_pt into corresponding bin_id // split_cond = -1 indicates that split_pt is less than all known cut points - CHECK_LT(upper_bound, - static_cast(std::numeric_limits::max())); + CHECK_LT(upper_bound, static_cast(std::numeric_limits::max())); for (uint32_t bound = lower_bound; bound < upper_bound; ++bound) { if (split_pt == gmat.cut.Values()[bound]) { split_cond = static_cast(bound); @@ -586,88 +515,20 @@ void QuantileHistMaker::Builder::FindSplitConditions( (*split_conditions)[i] = split_cond; } } -template -void QuantileHistMaker::Builder::AddSplitsToRowSet( - const std::vector& nodes, - RegTree* p_tree) { + +void HistRowPartitioner::AddSplitsToRowSet(const std::vector &nodes, + RegTree const *p_tree) { const size_t n_nodes = nodes.size(); for (unsigned int i = 0; i < n_nodes; ++i) { const int32_t nid = nodes[i].nid; const size_t n_left = partition_builder_.GetNLeftElems(i); const size_t n_right = partition_builder_.GetNRightElems(i); CHECK_EQ((*p_tree)[nid].LeftChild() + 1, (*p_tree)[nid].RightChild()); - row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), - (*p_tree)[nid].RightChild(), n_left, n_right); + row_set_collection_.AddSplit(nid, (*p_tree)[nid].LeftChild(), (*p_tree)[nid].RightChild(), + n_left, n_right); } } -template -template -void QuantileHistMaker::Builder::ApplySplit(const std::vector nodes, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - RegTree* p_tree) { - builder_monitor_.Start("ApplySplit"); - // 1. 
Find split condition for each split - const size_t n_nodes = nodes.size(); - std::vector split_conditions; - FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); - // 2.1 Create a blocked space of size SUM(samples in each node) - common::BlockedSpace2d space(n_nodes, [&](size_t node_in_set) { - int32_t nid = nodes[node_in_set].nid; - return row_set_collection_[nid].Size(); - }, kPartitionBlockSize); - // 2.2 Initialize the partition builder - // allocate buffers for storage intermediate results by each thread - partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) { - const int32_t nid = nodes[node_in_set].nid; - const size_t size = row_set_collection_[nid].Size(); - const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize); - return n_tasks; - }); - // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node - // Store results in intermediate buffers from partition_builder_ - common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node_in_set, common::Range1d r) { - size_t begin = r.begin(); - const int32_t nid = nodes[node_in_set].nid; - const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin); - partition_builder_.AllocateForTask(task_id); - switch (column_matrix.GetTypeSize()) { - case common::kUint8BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - case common::kUint16BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - case common::kUint32BinsTypeSize: - partition_builder_.template Partition(node_in_set, nid, r, - split_conditions[node_in_set], column_matrix, - *p_tree, row_set_collection_[nid].begin); - break; - default: - CHECK(false); // no default behavior - } - }); - // 3. Compute offsets to copy blocks of row-indexes - // from partition_builder_ to row_set_collection_ - partition_builder_.CalculateRowOffsets(); - - // 4. Copy elements from partition_builder_ to row_set_collection_ back - // with updated row-indexes for each tree-node - common::ParallelFor2d(space, this->ctx_->Threads(), [&](size_t node_in_set, common::Range1d r) { - const int32_t nid = nodes[node_in_set].nid; - partition_builder_.MergeToArray(node_in_set, r.begin(), - const_cast(row_set_collection_[nid].begin)); - }); - // 5. 
Add info about splits into row_set_collection_ - AddSplitsToRowSet(nodes, p_tree); - builder_monitor_.Stop("ApplySplit"); -} - template struct QuantileHistMaker::Builder; template struct QuantileHistMaker::Builder; diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 09df175cd0f8..4213d5dc8862 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -11,9 +11,9 @@ #include #include -#include +#include +#include #include -#include #include #include #include @@ -38,8 +38,6 @@ #include "../common/column_matrix.h" namespace xgboost { - - struct RandomReplace { public: // similar value as for minstd_rand @@ -82,15 +80,127 @@ struct RandomReplace { }; namespace tree { +class HistRowPartitioner { + // heuristically chosen block size of parallel partitioning + static constexpr size_t kPartitionBlockSize = 2048; + // worker class that partition a block of rows + common::PartitionBuilder partition_builder_; + // storage for row index + common::RowSetCollection row_set_collection_; + + /** + * \brief Turn split values into discrete bin indices. + */ + static void FindSplitConditions(const std::vector& nodes, const RegTree& tree, + const GHistIndexMatrix& gmat, + std::vector* split_conditions); + /** + * \brief Update the row set for new splits specifed by nodes. + */ + void AddSplitsToRowSet(const std::vector& nodes, RegTree const* p_tree); + + public: + bst_row_t base_rowid = 0; + + public: + HistRowPartitioner(size_t n_samples, size_t base_rowid, int32_t n_threads) { + row_set_collection_.Clear(); + const size_t block_size = n_samples / n_threads + !!(n_samples % n_threads); + dmlc::OMPException exc; + std::vector& row_indices = *row_set_collection_.Data(); + row_indices.resize(n_samples); + size_t* p_row_indices = row_indices.data(); + // parallel initialization o f row indices. (std::iota) +#pragma omp parallel num_threads(n_threads) + { + exc.Run([&]() { + const size_t tid = omp_get_thread_num(); + const size_t ibegin = tid * block_size; + const size_t iend = std::min(static_cast(ibegin + block_size), n_samples); + for (size_t i = ibegin; i < iend; ++i) { + p_row_indices[i] = i + base_rowid; + } + }); + } + row_set_collection_.Init(); + this->base_rowid = base_rowid; + } -using xgboost::GHistIndexMatrix; -using xgboost::common::GHistIndexRow; -using xgboost::common::HistCollection; -using xgboost::common::RowSetCollection; -using xgboost::common::GHistRow; -using xgboost::common::GHistBuilder; -using xgboost::common::ColumnMatrix; -using xgboost::common::Column; + template + void UpdatePosition(GenericParameter const* ctx, GHistIndexMatrix const& gmat, + common::ColumnMatrix const& column_matrix, + std::vector const& nodes, RegTree const* p_tree) { + // 1. 
Find split condition for each split + const size_t n_nodes = nodes.size(); + std::vector split_conditions; + FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); + // 2.1 Create a blocked space of size SUM(samples in each node) + common::BlockedSpace2d space( + n_nodes, + [&](size_t node_in_set) { + int32_t nid = nodes[node_in_set].nid; + return row_set_collection_[nid].Size(); + }, + kPartitionBlockSize); + // 2.2 Initialize the partition builder + // allocate buffers for storage intermediate results by each thread + partition_builder_.Init(space.Size(), n_nodes, [&](size_t node_in_set) { + const int32_t nid = nodes[node_in_set].nid; + const size_t size = row_set_collection_[nid].Size(); + const size_t n_tasks = size / kPartitionBlockSize + !!(size % kPartitionBlockSize); + return n_tasks; + }); + CHECK_EQ(base_rowid, gmat.base_rowid); + // 2.3 Split elements of row_set_collection_ to left and right child-nodes for each node + // Store results in intermediate buffers from partition_builder_ + common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { + size_t begin = r.begin(); + const int32_t nid = nodes[node_in_set].nid; + const size_t task_id = partition_builder_.GetTaskIdx(node_in_set, begin); + partition_builder_.AllocateForTask(task_id); + switch (column_matrix.GetTypeSize()) { + case common::kUint8BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + case common::kUint16BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + case common::kUint32BinsTypeSize: + partition_builder_.template Partition( + node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, + row_set_collection_[nid].begin); + break; + default: + // no default behavior + CHECK(false) << column_matrix.GetTypeSize(); + } + }); + // 3. Compute offsets to copy blocks of row-indexes + // from partition_builder_ to row_set_collection_ + partition_builder_.CalculateRowOffsets(); + + // 4. Copy elements from partition_builder_ to row_set_collection_ back + // with updated row-indexes for each tree-node + common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { + const int32_t nid = nodes[node_in_set].nid; + partition_builder_.MergeToArray(node_in_set, r.begin(), + const_cast(row_set_collection_[nid].begin)); + }); + // 5. 
Add info about splits into row_set_collection_ + AddSplitsToRowSet(nodes, p_tree); + } + + auto const& Partitions() const { return row_set_collection_; } + size_t Size() const { + return std::distance(row_set_collection_.begin(), row_set_collection_.end()); + } + auto& operator[](bst_node_t nidx) { return row_set_collection_[nidx]; } + auto const& operator[](bst_node_t nidx) const { return row_set_collection_[nidx]; } +}; inline BatchParam HistBatch(TrainParam const& param) { return {param.max_bin, param.sparse_threshold}; @@ -185,21 +295,7 @@ class QuantileHistMaker: public TreeUpdater { size_t GetNumberOfTrees(); - void InitSampling(const DMatrix& fmat, - std::vector* gpair, - std::vector* row_indices); - - template - void ApplySplit(std::vector nodes, - const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, - RegTree* p_tree); - - void AddSplitsToRowSet(const std::vector& nodes, RegTree* p_tree); - - - void FindSplitConditions(const std::vector& nodes, const RegTree& tree, - const GHistIndexMatrix& gmat, std::vector* split_conditions); + void InitSampling(const DMatrix& fmat, std::vector* gpair); template void InitRoot(DMatrix* p_fmat, @@ -221,7 +317,7 @@ class QuantileHistMaker: public TreeUpdater { template void ExpandTree(const GHistIndexMatrix& gmat, - const ColumnMatrix& column_matrix, + const common::ColumnMatrix& column_matrix, DMatrix* p_fmat, RegTree* p_tree, const std::vector& gpair_h); @@ -232,9 +328,6 @@ class QuantileHistMaker: public TreeUpdater { std::shared_ptr column_sampler_{ std::make_shared()}; - std::vector unused_rows_; - // the internal row sets - RowSetCollection row_set_collection_; std::vector gpair_local_; /*! \brief feature with least # of bins. to be used for dense specialization @@ -243,12 +336,12 @@ class QuantileHistMaker: public TreeUpdater { std::unique_ptr pruner_; std::unique_ptr> evaluator_; - - static constexpr size_t kPartitionBlockSize = 2048; - common::PartitionBuilder partition_builder_; + // Right now there's only 1 partitioner in this vector, when external memory is fully + // supported we will have number of partitioners equal to number of pages. 
+ std::vector partitioner_; // back pointers to tree and data matrix - const RegTree* p_last_tree_; + const RegTree* p_last_tree_{nullptr}; DMatrix const* const p_last_fmat_; DMatrix* p_last_fmat_mutable_; diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index f3760534b4ab..e46726e7401c 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -40,7 +40,7 @@ template void TestEvaluateSplits() { std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); - auto hist_builder = GHistBuilder(gmat.cut.Ptrs().back()); + auto hist_builder = common::GHistBuilder(gmat.cut.Ptrs().back()); hist.Init(gmat.cut.Ptrs().back()); hist.AddHistRow(0); hist.AllocateAllData(); @@ -94,7 +94,7 @@ TEST(HistEvaluator, Apply) { RegTree tree; int static constexpr kNRows = 8, kNCols = 16; TrainParam param; - param.UpdateAllowUnknown(Args{{}}); + param.UpdateAllowUnknown(Args{{"min_child_weight", "0"}, {"reg_lambda", "0.0"}}); auto dmat = RandomDataGenerator(kNRows, kNCols, 0).Seed(3).GenerateDMatrix(); auto sampler = std::make_shared(); auto evaluator_ = HistEvaluator{param, dmat->Info(), 4, sampler, @@ -102,12 +102,22 @@ TEST(HistEvaluator, Apply) { CPUExpandEntry entry{0, 0, 10.0f}; entry.split.left_sum = GradStats{0.4, 0.6f}; - entry.split.right_sum = GradStats{0.5, 0.7f}; + entry.split.right_sum = GradStats{0.5, 0.5f}; evaluator_.ApplyTreeSplit(entry, &tree); ASSERT_EQ(tree.NumExtraNodes(), 2); ASSERT_EQ(tree.Stat(tree[0].LeftChild()).sum_hess, 0.6f); - ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.7f); + ASSERT_EQ(tree.Stat(tree[0].RightChild()).sum_hess, 0.5f); + + { + RegTree tree; + entry.split.is_cat = true; + entry.split.split_value = 1.0; + evaluator_.ApplyTreeSplit(entry, &tree); + auto l = entry.split.left_sum; + ASSERT_NEAR(tree[1].LeafValue(), -l.sum_grad / l.sum_hess * param.learning_rate, kRtEps); + ASSERT_NEAR(tree[2].LeafValue(), -param.learning_rate, kRtEps); + } } TEST_F(TestPartitionBasedSplit, CPUHist) { diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index 639768b5eabb..a37c0973627e 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -1,26 +1,14 @@ /*! - * Copyright 2021 XGBoost contributors + * Copyright 2021-2022, XGBoost contributors. */ #include #include "../../../src/tree/updater_approx.h" #include "../helpers.h" +#include "test_partitioner.h" namespace xgboost { namespace tree { -namespace { -void GetSplit(RegTree *tree, float split_value, std::vector *candidates) { - tree->ExpandNode( - /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, - /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, - /*left_sum=*/0.0f, - /*right_sum=*/0.0f); - candidates->front().split.split_value = split_value; - candidates->front().split.sindex = 0; - candidates->front().split.sindex |= (1U << 31); -} -} // anonymous namespace - TEST(Approx, Partitioner) { size_t n_samples = 1024, n_features = 1, base_rowid = 0; ApproxRowPartitioner partitioner{n_samples, base_rowid}; diff --git a/tests/cpp/tree/test_partitioner.h b/tests/cpp/tree/test_partitioner.h new file mode 100644 index 000000000000..109749a2832f --- /dev/null +++ b/tests/cpp/tree/test_partitioner.h @@ -0,0 +1,21 @@ +/*! + * Copyright 2021-2022, XGBoost contributors. 
+ */ +#include +#include +#include "../../../src/tree/hist/expand_entry.h" + +namespace xgboost { +namespace tree { +inline void GetSplit(RegTree *tree, float split_value, std::vector *candidates) { + tree->ExpandNode( + /*nid=*/RegTree::kRoot, /*split_index=*/0, /*split_value=*/split_value, + /*default_left=*/true, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, + /*left_sum=*/0.0f, + /*right_sum=*/0.0f); + candidates->front().split.split_value = split_value; + candidates->front().split.sindex = 0; + candidates->front().split.sindex |= (1U << 31); +} +} // namespace tree +} // namespace xgboost diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index d043c5bb5a47..d2bebf0616d1 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -1,18 +1,19 @@ /*! * Copyright 2018-2022 by XGBoost Contributors */ +#include #include #include -#include #include -#include #include +#include -#include "../helpers.h" #include "../../../src/tree/param.h" -#include "../../../src/tree/updater_quantile_hist.h" #include "../../../src/tree/split_evaluator.h" +#include "../../../src/tree/updater_quantile_hist.h" +#include "../helpers.h" +#include "test_partitioner.h" #include "xgboost/data.h" namespace xgboost { @@ -94,130 +95,6 @@ class QuantileHistMock : public QuantileHistMaker { } } } - - void TestInitDataSampling(const GHistIndexMatrix& gmat, - std::vector* gpair, - DMatrix* p_fmat, - const RegTree& tree) { - // check SimpleSkip - size_t initial_seed = 777; - std::linear_congruential_engine(1) << 63 > eng_first(initial_seed); - for (size_t i = 0; i < 100; ++i) { - eng_first(); - } - uint64_t initial_seed_th = RandomReplace::SimpleSkip(100, initial_seed, 16807, RandomReplace::kMod); - std::linear_congruential_engine eng_second(initial_seed_th); - ASSERT_EQ(eng_first(), eng_second()); - - const size_t nthreads = omp_get_num_threads(); - // save state of global rng engine - auto initial_rnd = common::GlobalRandom(); - std::vector unused_rows_cpy = this->unused_rows_; - RealImpl::InitData(gmat, *p_fmat, tree, gpair); - std::vector row_indices_initial = *(this->row_set_collection_.Data()); - std::vector unused_row_indices_initial = this->unused_rows_; - ASSERT_EQ(row_indices_initial.size(), p_fmat->Info().num_row_); - auto check_each_row_occurs_in_one_of_arrays = [](const std::vector& first, - const std::vector& second, - size_t nrows) { - ASSERT_EQ(first.size(), nrows); - ASSERT_EQ(second.size(), 0); - }; - check_each_row_occurs_in_one_of_arrays(row_indices_initial, unused_row_indices_initial, - p_fmat->Info().num_row_); - - for (size_t i_nthreads = 1; i_nthreads < 4; ++i_nthreads) { - omp_set_num_threads(i_nthreads); - // return initial state of global rng engine - common::GlobalRandom() = initial_rnd; - this->unused_rows_ = unused_rows_cpy; - RealImpl::InitData(gmat, *p_fmat, tree, gpair); - std::vector& row_indices = *(this->row_set_collection_.Data()); - ASSERT_EQ(row_indices_initial.size(), row_indices.size()); - for (size_t i = 0; i < row_indices_initial.size(); ++i) { - ASSERT_EQ(row_indices_initial[i], row_indices[i]); - } - std::vector& unused_row_indices = this->unused_rows_; - ASSERT_EQ(unused_row_indices_initial.size(), unused_row_indices.size()); - for (size_t i = 0; i < unused_row_indices_initial.size(); ++i) { - ASSERT_EQ(unused_row_indices_initial[i], unused_row_indices[i]); - } - check_each_row_occurs_in_one_of_arrays(row_indices, unused_row_indices, - p_fmat->Info().num_row_); - } - omp_set_num_threads(nthreads); - } - - void 
TestApplySplit(const RegTree& tree) { - std::vector row_gpairs = - { {1.23f, 0.24f}, {0.24f, 0.25f}, {0.26f, 0.27f}, {2.27f, 0.28f}, - {0.27f, 0.29f}, {0.37f, 0.39f}, {-0.47f, 0.49f}, {0.57f, 0.59f} }; - int32_t constexpr kMaxBins = 4; - - // try out different sparsity to get different number of missing values - for (double sparsity : {0.0, 0.1, 0.2}) { - // kNRows samples with kNCols features - auto dmat = RandomDataGenerator(kNRows, kNCols, sparsity).Seed(3).GenerateDMatrix(); - - float sparse_th = 0.0; - GHistIndexMatrix gmat{dmat.get(), kMaxBins, sparse_th, false, common::OmpGetNumThreads(0)}; - ColumnMatrix cm; - - // treat everything as dense, as this is what we intend to test here - cm.Init(gmat, sparse_th, common::OmpGetNumThreads(0)); - RealImpl::InitData(gmat, *dmat, tree, &row_gpairs); - const size_t num_row = dmat->Info().num_row_; - // split by feature 0 - const size_t bin_id_min = gmat.cut.Ptrs()[0]; - const size_t bin_id_max = gmat.cut.Ptrs()[1]; - - // attempt to split at different bins - for (size_t split = 0; split < 4; split++) { - size_t left_cnt = 0, right_cnt = 0; - - // manually compute how many samples go left or right - for (size_t rid = 0; rid < num_row; ++rid) { - for (size_t offset = gmat.row_ptr[rid]; offset < gmat.row_ptr[rid + 1]; ++offset) { - const size_t bin_id = gmat.index[offset]; - if (bin_id >= bin_id_min && bin_id < bin_id_max) { - if (bin_id <= split) { - left_cnt++; - } else { - right_cnt++; - } - } - } - } - - // if any were missing due to sparsity, we add them to the left or to the right - size_t missing = kNRows - left_cnt - right_cnt; - if (tree[0].DefaultLeft()) { - left_cnt += missing; - } else { - right_cnt += missing; - } - - // have one node with kNRows (=8 at the moment) rows, just one task - RealImpl::partition_builder_.Init(1, 1, [&](size_t node_in_set) { - return 1; - }); - const size_t task_id = RealImpl::partition_builder_.GetTaskIdx(0, 0); - RealImpl::partition_builder_.AllocateForTask(task_id); - if (cm.AnyMissing()) { - RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows), - split, cm, tree, this->row_set_collection_[0].begin); - } else { - RealImpl::partition_builder_.template Partition(0, 0, common::Range1d(0, kNRows), - split, cm, tree, this->row_set_collection_[0].begin); - } - RealImpl::partition_builder_.CalculateRowOffsets(); - ASSERT_EQ(RealImpl::partition_builder_.GetNLeftElems(0), left_cnt); - ASSERT_EQ(RealImpl::partition_builder_.GetNRightElems(0), right_cnt); - } - } - } }; int static constexpr kNRows = 8, kNCols = 16; @@ -262,33 +139,6 @@ class QuantileHistMock : public QuantileHistMaker { float_builder_->TestInitData(gmat, &gpair, dmat_.get(), tree); } } - - void TestInitDataSampling() { - int32_t constexpr kMaxBins = 4; - GHistIndexMatrix gmat{dmat_.get(), kMaxBins, 0.0f, false, common::OmpGetNumThreads(0)}; - - RegTree tree = RegTree(); - tree.param.UpdateAllowUnknown(cfg_); - - std::vector gpair = - { {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, {0.23f, 0.24f}, - {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f}, {0.27f, 0.29f} }; - if (double_builder_) { - double_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree); - } else { - float_builder_->TestInitDataSampling(gmat, &gpair, dmat_.get(), tree); - } - } - - void TestApplySplit() { - RegTree tree = RegTree(); - tree.param.UpdateAllowUnknown(cfg_); - if (double_builder_) { - double_builder_->TestApplySplit(tree); - } else { - float_builder_->TestApplySplit(tree); - } - } }; TEST(QuantileHist, InitData) { @@ -301,30 +151,62 
@@ TEST(QuantileHist, InitData) { maker_float.TestInitData(); } -TEST(QuantileHist, InitDataSampling) { - const float subsample = 0.5; - std::vector> cfg - {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}, - {"subsample", std::to_string(subsample)}}; - QuantileHistMock maker(cfg); - maker.TestInitDataSampling(); - const bool single_precision_histogram = true; - QuantileHistMock maker_float(cfg, single_precision_histogram); - maker_float.TestInitDataSampling(); -} - -TEST(QuantileHist, ApplySplit) { - std::vector> cfg - {{"num_feature", std::to_string(QuantileHistMock::GetNumColumns())}, - {"split_evaluator", "elastic_net"}, - {"reg_lambda", "0"}, {"reg_alpha", "0"}, {"max_delta_step", "0"}, - {"min_child_weight", "0"}}; - QuantileHistMock maker(cfg); - maker.TestApplySplit(); - const bool single_precision_histogram = true; - QuantileHistMock maker_float(cfg, single_precision_histogram); - maker_float.TestApplySplit(); +TEST(QuantileHist, Partitioner) { + size_t n_samples = 1024, n_features = 1, base_rowid = 0; + GenericParameter ctx; + ctx.InitAllowUnknown(Args{}); + + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + ASSERT_EQ(partitioner.base_rowid, base_rowid); + ASSERT_EQ(partitioner.Size(), 1); + ASSERT_EQ(partitioner.Partitions()[0].Size(), n_samples); + + auto Xy = RandomDataGenerator{n_samples, n_features, 0}.GenerateDMatrix(true); + std::vector candidates{{0, 0, 0.4}}; + + auto grad = GenerateRandomGradients(n_samples); + std::vector hess(grad.Size()); + std::transform(grad.HostVector().cbegin(), grad.HostVector().cend(), hess.begin(), + [](auto gpair) { return gpair.GetHess(); }); + + for (auto const& page : Xy->GetBatches({64, 0.5})) { + bst_feature_t const split_ind = 0; + common::ColumnMatrix column_indices; + column_indices.Init(page, 0.5, ctx.Threads()); + { + auto min_value = page.cut.MinValues()[split_ind]; + RegTree tree; + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + GetSplit(&tree, min_value, &candidates); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + ASSERT_EQ(partitioner.Size(), 3); + ASSERT_EQ(partitioner[1].Size(), 0); + ASSERT_EQ(partitioner[2].Size(), n_samples); + } + { + HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; + auto ptr = page.cut.Ptrs()[split_ind + 1]; + float split_value = page.cut.Values().at(ptr / 2); + RegTree tree; + GetSplit(&tree, split_value, &candidates); + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + + auto elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto it = elem.begin; it != elem.end; ++it) { + auto value = page.cut.Values().at(page.index[*it]); + ASSERT_LE(value, split_value); + } + auto right_nidx = tree[RegTree::kRoot].RightChild(); + elem = partitioner[right_nidx]; + for (auto it = elem.begin; it != elem.end; ++it) { + auto value = page.cut.Values().at(page.index[*it]); + ASSERT_GT(value, split_value) << *it; + } + } + } } - } // namespace tree } // namespace xgboost diff --git a/tests/python/test_updaters.py b/tests/python/test_updaters.py index 3e80d273899f..b73736c6950f 100644 --- a/tests/python/test_updaters.py +++ b/tests/python/test_updaters.py @@ -245,3 +245,4 @@ def run_categorical_basic(self, rows, cols, rounds, cats, tree_method): @pytest.mark.skipif(**tm.no_pandas()) def test_categorical(self, rows, cols, rounds, cats): self.run_categorical_basic(rows, cols, 
rounds, cats, "approx") + self.run_categorical_basic(rows, cols, rounds, cats, "hist") From eba5252a9d5afce7f7b0ddb54fed5fdb355df2ef Mon Sep 17 00:00:00 2001 From: fis Date: Wed, 23 Feb 2022 23:18:39 +0800 Subject: [PATCH 2/3] Fix ref. --- src/common/partition_builder.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 0fdde231f731..774126dd76f1 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -116,7 +116,7 @@ class PartitionBuilder { auto const& index = gmat.index; auto const& cut_values = gmat.cut.Values(); - auto cut_ptrs = gmat.cut.Ptrs(); + auto const& cut_ptrs = gmat.cut.Ptrs(); auto pred = [&](auto ridx, auto bin_id) { bool go_left; From d55fcc9473d74599761ee2581b65618809e20de9 Mon Sep 17 00:00:00 2001 From: fis Date: Thu, 24 Feb 2022 01:40:38 +0800 Subject: [PATCH 3/3] Small optimization. --- src/common/partition_builder.h | 16 ++++++++++------ src/tree/updater_quantile_hist.cc | 9 +++++++-- src/tree/updater_quantile_hist.h | 8 ++++---- tests/cpp/tree/test_quantile_hist.cc | 4 ++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 774126dd76f1..3250b9d2bf25 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -61,7 +61,11 @@ class PartitionBuilder { size_t nright_elems = 0; auto state = column.GetInitialState(row_indices.front() - base_rowid); - for (auto rid : row_indices) { + auto p_row_indices = row_indices.data(); + auto n_samples = row_indices.size(); + + for (size_t i = 0; i < n_samples; ++i) { + auto rid = p_row_indices[i]; const int32_t bin_id = column.GetBinIdx(rid - base_rowid, &state); if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { @@ -100,7 +104,7 @@ class PartitionBuilder { return {nleft_elems, nright_elems}; } - template + template void Partition(const size_t node_in_set, const size_t nid, const common::Range1d range, const int32_t split_cond, GHistIndexMatrix const& gmat, const ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid) { @@ -119,8 +123,7 @@ class PartitionBuilder { auto const& cut_ptrs = gmat.cut.Ptrs(); auto pred = [&](auto ridx, auto bin_id) { - bool go_left; - if (is_cat) { + if (any_cat && is_cat) { auto begin = gmat.RowIdx(ridx); auto end = gmat.RowIdx(ridx + 1); auto f_begin = cut_ptrs[fid]; @@ -128,15 +131,16 @@ class PartitionBuilder { // bypassing the column matrix as we need the cut value instead of bin idx for categorical // features. 
auto gidx = BinarySearchBin(begin, end, index, f_begin, f_end); + bool go_left; if (gidx == -1) { go_left = default_left; } else { go_left = Decision(node_cats, cut_values[gidx], default_left); } + return go_left; } else { - go_left = bin_id <= split_cond; + return bin_id <= split_cond; } - return go_left; }; std::pair child_nodes_sizes; diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc index 60b501b7c4a4..cfc95039ea24 100644 --- a/src/tree/updater_quantile_hist.cc +++ b/src/tree/updater_quantile_hist.cc @@ -256,8 +256,13 @@ void QuantileHistMaker::Builder::ExpandTree( if (nodes_for_apply_split.size() != 0) { HistRowPartitioner &partitioner = this->partitioner_.front(); - partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, - nodes_for_apply_split, p_tree); + if (gmat.cut.HasCategorical()) { + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + } else { + partitioner.UpdatePosition(this->ctx_, gmat, column_matrix, + nodes_for_apply_split, p_tree); + } SplitSiblings(nodes_for_apply_split, &nodes_to_evaluate, p_tree); diff --git a/src/tree/updater_quantile_hist.h b/src/tree/updater_quantile_hist.h index 4213d5dc8862..1c863028c53b 100644 --- a/src/tree/updater_quantile_hist.h +++ b/src/tree/updater_quantile_hist.h @@ -126,7 +126,7 @@ class HistRowPartitioner { this->base_rowid = base_rowid; } - template + template void UpdatePosition(GenericParameter const* ctx, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, std::vector const& nodes, RegTree const* p_tree) { @@ -160,17 +160,17 @@ class HistRowPartitioner { partition_builder_.AllocateForTask(task_id); switch (column_matrix.GetTypeSize()) { case common::kUint8BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; case common::kUint16BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; case common::kUint32BinsTypeSize: - partition_builder_.template Partition( + partition_builder_.template Partition( node_in_set, nid, r, split_conditions[node_in_set], gmat, column_matrix, *p_tree, row_set_collection_[nid].begin); break; diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index d2bebf0616d1..4a26496cb80f 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -178,7 +178,7 @@ TEST(QuantileHist, Partitioner) { RegTree tree; HistRowPartitioner partitioner{n_samples, base_rowid, ctx.Threads()}; GetSplit(&tree, min_value, &candidates); - partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); ASSERT_EQ(partitioner.Size(), 3); ASSERT_EQ(partitioner[1].Size(), 0); ASSERT_EQ(partitioner[2].Size(), n_samples); @@ -190,7 +190,7 @@ TEST(QuantileHist, Partitioner) { RegTree tree; GetSplit(&tree, split_value, &candidates); auto left_nidx = tree[RegTree::kRoot].LeftChild(); - partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); + partitioner.UpdatePosition(&ctx, page, column_indices, candidates, &tree); auto elem = partitioner[left_nidx]; ASSERT_LT(elem.Size(), n_samples);