Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expand categorical node. #6028

Merged
merged 12 commits into from Aug 25, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
5 changes: 3 additions & 2 deletions R-package/tests/testthat/test_basic.R
Expand Up @@ -245,11 +245,12 @@ test_that("training continuation works", {
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
# test continuing from a model in file
xgb.save(bst1, "xgboost.model")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.model")
xgb.save(bst1, "xgboost.json")
bst2 <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0, xgb_model = "xgboost.json")
if (!windows_flag && !solaris_flag)
expect_equal(bst$raw, bst2$raw)
expect_equal(dim(bst2$evaluation_log), c(2, 2))
file.remove("xgboost.json")
})

test_that("model serialization works", {
Expand Down
18 changes: 9 additions & 9 deletions R-package/tests/testthat/test_callbacks.R
Expand Up @@ -173,16 +173,16 @@ test_that("cb.reset.parameters works as expected", {
})

test_that("cb.save.model works as expected", {
files <- c('xgboost_01.model', 'xgboost_02.model', 'xgboost.model')
files <- c('xgboost_01.json', 'xgboost_02.json', 'xgboost.json')
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
for (f in files) if (file.exists(f)) file.remove(f)

bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
save_period = 1, save_name = "xgboost_%02d.model")
expect_true(file.exists('xgboost_01.model'))
expect_true(file.exists('xgboost_02.model'))
b1 <- xgb.load('xgboost_01.model')
save_period = 1, save_name = "xgboost_%02d.json")
expect_true(file.exists('xgboost_01.json'))
expect_true(file.exists('xgboost_02.json'))
b1 <- xgb.load('xgboost_01.json')
expect_equal(xgb.ntree(b1), 1)
b2 <- xgb.load('xgboost_02.model')
b2 <- xgb.load('xgboost_02.json')
expect_equal(xgb.ntree(b2), 2)

xgb.config(b2) <- xgb.config(bst)
Expand All @@ -191,9 +191,9 @@ test_that("cb.save.model works as expected", {

# save_period = 0 saves the last iteration's model
bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, eta = 1, verbose = 0,
save_period = 0)
expect_true(file.exists('xgboost.model'))
b2 <- xgb.load('xgboost.model')
save_period = 0, save_name = 'xgboost.json')
expect_true(file.exists('xgboost.json'))
b2 <- xgb.load('xgboost.json')
xgb.config(b2) <- xgb.config(bst)
expect_equal(bst$raw, b2$raw)

Expand Down
3 changes: 2 additions & 1 deletion include/xgboost/base.h
Expand Up @@ -109,7 +109,8 @@ using bst_int = int32_t; // NOLINT
using bst_ulong = uint64_t; // NOLINT
/*! \brief float type, used for storing statistics */
using bst_float = float; // NOLINT

/*! \brief Categorical value type. */
using bst_cat_t = int32_t; // NOLINT
/*! \brief Type for data column (feature) index. */
using bst_feature_t = uint32_t; // NOLINT
/*! \brief Type for data row index.
Expand Down
9 changes: 2 additions & 7 deletions include/xgboost/data.h
Expand Up @@ -35,7 +35,8 @@ enum class DataType : uint8_t {
};

enum class FeatureType : uint8_t {
kNumerical
kNumerical,
kCategorical
};

/*!
Expand Down Expand Up @@ -309,12 +310,6 @@ class SparsePage {
}
}

/*!
* \brief Push row block into the page.
* \param batch the row batch.
*/
void Push(const dmlc::RowBlock<uint32_t>& batch);

/**
* \brief Pushes external data batch onto this page
*
Expand Down
15 changes: 13 additions & 2 deletions include/xgboost/span.h
Expand Up @@ -101,6 +101,18 @@ namespace common {
} while (0);
#endif // __CUDA_ARCH__

/*
 * SPAN_LT(lhs, rhs): bounds-check helper asserting `lhs < rhs`.
 *
 * - Device code (__CUDA_ARCH__ defined): on failure, prints both operands and
 *   executes the PTX `trap` instruction.
 * - Host code: forwards the comparison to SPAN_CHECK.
 *
 * The device variant is wrapped in `do { ... } while (0)` so the macro expands
 * to a single statement: it is safe inside an unbraced `if`/`else` and consumes
 * the caller's trailing semicolon.  The operands are cast to
 * `unsigned long long` and printed with `%llu`, so the format matches the
 * argument width on both LP64 and LLP64 targets (`%lu` with `size_t` does not
 * match on LLP64, where `unsigned long` is 32-bit).
 *
 * Note: on failure the operands are evaluated a second time for the message,
 * so they should be free of side effects.
 */
#if defined(__CUDA_ARCH__)
#define SPAN_LT(lhs, rhs)                                   \
  do {                                                      \
    if (!((lhs) < (rhs))) {                                 \
      printf("%llu < %llu failed\n",                        \
             static_cast<unsigned long long>(lhs),          \
             static_cast<unsigned long long>(rhs));         \
      asm("trap;");                                         \
    }                                                       \
  } while (0)
#else
#define SPAN_LT(lhs, rhs) SPAN_CHECK((lhs) < (rhs))
#endif  // defined(__CUDA_ARCH__)

namespace detail {
/*!
* By default, XGBoost uses uint32_t for indexing data. int64_t covers all
Expand Down Expand Up @@ -515,7 +527,7 @@ class Span {
}

XGBOOST_DEVICE reference operator[](index_type _idx) const {
SPAN_CHECK(_idx < size());
SPAN_LT(_idx, size());
return data()[_idx];
}

Expand Down Expand Up @@ -575,7 +587,6 @@ class Span {
detail::ExtentValue<Extent, Offset, Count>::value> {
SPAN_CHECK((Count == dynamic_extent) ?
(Offset <= size()) : (Offset + Count <= size()));

return {data() + Offset, Count == dynamic_extent ? size() - Offset : Count};
}

Expand Down
78 changes: 57 additions & 21 deletions include/xgboost/tree_model.h
Expand Up @@ -318,6 +318,8 @@ class RegTree : public Model {
param.num_deleted = 0;
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
for (int i = 0; i < param.num_nodes; i ++) {
nodes_[i].SetLeaf(0.0f);
nodes_[i].SetParent(kInvalidNodeId);
Expand Down Expand Up @@ -412,30 +414,33 @@ class RegTree : public Model {
* \param leaf_right_child The right child index of leaf, by default kInvalidNodeId,
* some updaters use the right child index of leaf as a marker
*/
void ExpandNode(int nid, unsigned split_index, bst_float split_value,
void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value,
bool default_left, bst_float base_weight,
bst_float left_leaf_weight, bst_float right_leaf_weight,
bst_float loss_change, float sum_hess, float left_sum,
float right_sum,
bst_node_t leaf_right_child = kInvalidNodeId) {
int pleft = this->AllocNode();
int pright = this->AllocNode();
auto &node = nodes_[nid];
CHECK(node.IsLeaf());
node.SetLeftChild(pleft);
node.SetRightChild(pright);
nodes_[node.LeftChild()].SetParent(nid, true);
nodes_[node.RightChild()].SetParent(nid, false);
node.SetSplit(split_index, split_value,
default_left);

nodes_[pleft].SetLeaf(left_leaf_weight, leaf_right_child);
nodes_[pright].SetLeaf(right_leaf_weight, leaf_right_child);

this->Stat(nid) = {loss_change, sum_hess, base_weight};
this->Stat(pleft) = {0.0f, left_sum, left_leaf_weight};
this->Stat(pright) = {0.0f, right_sum, right_leaf_weight};
}
bst_node_t leaf_right_child = kInvalidNodeId);

/**
* \brief Expands a leaf node with categories
*
* \param nid The node index to expand.
* \param split_index Feature index of the split.
* \param split_cat The bitset containing categories
* \param default_left True to default left.
* \param base_weight The base weight, before learning rate.
* \param left_leaf_weight The left leaf weight for prediction, modified by learning rate.
* \param right_leaf_weight The right leaf weight for prediction, modified by learning rate.
* \param loss_change The loss change.
* \param sum_hess The sum of hessians.
* \param left_sum The sum of hessians of the left leaf.
* \param right_sum The sum of hessians of the right leaf.
*/
void ExpandCategorical(bst_node_t nid, unsigned split_index,
RAMitchell marked this conversation as resolved.
Show resolved Hide resolved
common::Span<uint32_t> split_cat, bool default_left,
bst_float base_weight, bst_float left_leaf_weight,
bst_float right_leaf_weight, bst_float loss_change,
float sum_hess, float left_sum, float right_sum);

/*!
* \brief get current depth
Expand Down Expand Up @@ -588,6 +593,28 @@ class RegTree : public Model {
* \brief calculate the mean value for each node, required for feature contributions
*/
void FillNodeMeanValues();
/*!
* \brief Get split type for a node.
* \param nidx Index of node.
* \return The type of this split. For a leaf node it is always kNumerical.
* \note The lookup goes through the bounds-checked std::vector::at, so an index
*       outside split_types_ throws std::out_of_range.
*/
FeatureType NodeSplitType(bst_node_t nidx) const {
return split_types_.at(nidx);
}
/*!
* \brief Get split types for all nodes.
* \return Read-only reference to split_types_.
*/
std::vector<FeatureType> const &GetSplitTypes() const { return split_types_; }
/*!
* \brief Get the category storage of the tree.
* \return Read-only span over split_categories_.
*/
common::Span<uint32_t const> GetSplitCategories() const { return split_categories_; }
/*!
* \brief Get the per-node segments that index into the category storage.
* \return Read-only reference to split_categories_segments_; entry i gives the
*         (beg, size) range of split_categories_ that belongs to node i.
*/
auto const& GetSplitCategoriesPtr() const { return split_categories_segments_; }

// The fields of split_categories_segments_[i] are set such that
// the range split_categories_[beg:(beg+size)] stores the bitset for
// the matching categories for the i-th node.
struct Segment {
trivialfis marked this conversation as resolved.
Show resolved Hide resolved
size_t beg {0};
size_t size {0};
};

private:
// vector of nodes
Expand All @@ -597,9 +624,16 @@ class RegTree : public Model {
// stats of nodes
std::vector<RTreeNodeStat> stats_;
std::vector<bst_float> node_mean_values_;
std::vector<FeatureType> split_types_;

// Categories for each internal node.
std::vector<uint32_t> split_categories_;
// Ptr to split categories of each node.
std::vector<Segment> split_categories_segments_;
hcho3 marked this conversation as resolved.
Show resolved Hide resolved

// allocate a new node,
// !!!!!! NOTE: may cause BUG here, nodes.resize
int AllocNode() {
bst_node_t AllocNode() {
if (param.num_deleted != 0) {
int nid = deleted_nodes_.back();
deleted_nodes_.pop_back();
Expand All @@ -612,6 +646,8 @@ class RegTree : public Model {
<< "number of nodes in the tree exceed 2^31";
nodes_.resize(param.num_nodes);
stats_.resize(param.num_nodes);
split_types_.resize(param.num_nodes, FeatureType::kNumerical);
split_categories_segments_.resize(param.num_nodes);
return nd;
}
// delete a tree node, keep the parent field to allow trace back
Expand Down
2 changes: 1 addition & 1 deletion python-package/xgboost/core.py
Expand Up @@ -40,7 +40,7 @@ class EarlyStopException(Exception):
"""

def __init__(self, best_iteration):
super(EarlyStopException, self).__init__()
super().__init__()
self.best_iteration = best_iteration


Expand Down
4 changes: 2 additions & 2 deletions python-package/xgboost/sklearn.py
Expand Up @@ -1022,7 +1022,7 @@ def __init__(self,
**kwargs)

def get_xgb_params(self):
params = super(XGBRFClassifier, self).get_xgb_params()
params = super().get_xgb_params()
params['num_parallel_tree'] = self.n_estimators
return params

Expand Down Expand Up @@ -1051,7 +1051,7 @@ def __init__(self, learning_rate=1, subsample=0.8, colsample_bynode=0.8,
reg_lambda=reg_lambda, **kwargs)

def get_xgb_params(self):
params = super(XGBRFRegressor, self).get_xgb_params()
params = super().get_xgb_params()
params['num_parallel_tree'] = self.n_estimators
return params

Expand Down