[R] Fix global feature importance and predict with 1 sample. #7394

Merged · 3 commits · Nov 5, 2021 · showing changes from 1 commit

6 changes: 3 additions & 3 deletions R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
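With this change, the `trees` argument of `xgb.importance` is forwarded to the native `XGBoosterFeatureScore` call. A minimal usage sketch (variable names and training settings are illustrative; tree indices are 0-based, and a multi-class gbtree model lays its trees out round-robin per class):

    library(xgboost)

    # A hypothetical 3-class model on the built-in iris data.
    x <- as.matrix(iris[, -5])
    y <- as.numeric(iris$Species) - 1
    nclass <- 3
    nrounds <- 5
    mbst <- xgboost(data = x, label = y, nrounds = nrounds,
                    objective = "multi:softprob", num_class = nclass,
                    max_depth = 3, verbose = 0)

    # Class 0 owns trees 0, nclass, 2 * nclass, ..., so its per-class
    # importance can now be requested directly:
    class0 <- seq(from = 0, by = nclass, length.out = nrounds)
    imp_class0 <- xgb.importance(model = mbst, trees = class0)

    # Omitting `trees` keeps the old behaviour: scores over all trees.
    imp_all <- xgb.importance(model = mbst)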
31 changes: 30 additions & 1 deletion R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {
7 changes: 4 additions & 3 deletions include/xgboost/gbm.h
@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
                          bool with_stats,
                          std::string format) const = 0;

-  virtual void FeatureScore(std::string const &importance_type,
-                            std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const = 0;
+  virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
+                            std::vector<bst_feature_t>* features,
+                            std::vector<float>* scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */
7 changes: 4 additions & 3 deletions include/xgboost/learner.h
@@ -156,9 +156,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Calculate feature score. See doc in C API for outputs.
    */
-  virtual void CalcFeatureScore(std::string const &importance_type,
-                                std::vector<bst_feature_t> *features,
-                                std::vector<float> *scores) = 0;
+  virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
+                                std::vector<bst_feature_t>* features,
+                                std::vector<float>* scores) = 0;

   /*
    * \brief Get number of boosted rounds from gradient booster.
12 changes: 10 additions & 2 deletions src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }

-  auto& scores = learner->GetThreadLocal().ret_vec_float;
+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
+  auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
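On the R side, the wrapper shown earlier serializes its arguments into the JSON config parsed here; a sketch of the resulting payload (values illustrative):

    args <- list(importance_type = "total_gain",
                 feature_names = NULL,
                 tree_idx = c(0L, 3L, 6L))
    jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
    # {"importance_type":"total_gain","feature_names":null,"tree_idx":[0,3,6]}

When no selection is made, `tree_idx` is sent as null, the `IsA<Null>` branch above is skipped, and the empty vector downstream means "all trees".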
2 changes: 2 additions & 0 deletions src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
20 changes: 15 additions & 5 deletions src/gbm/gbtree.h
@@ -300,18 +300,28 @@ class GBTree : public GradientBooster {
     }
   }

-  void FeatureScore(std::string const &importance_type,
-                    std::vector<bst_feature_t> *features,
-                    std::vector<float> *scores) const override {
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                    std::vector<bst_feature_t>* features,
+                    std::vector<float>* scores) const override {
     // Because a feature with no importance doesn't appear in the return value,
     // we need to set up another pair of vectors to store the values during
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      // An empty selection defaults to all trees in the model.
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }

+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LT(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
-          auto const &node = (*p_tree)[nidx];
+          auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
             split_counts[node.SplitIndex()]++;
             fn(p_tree, nidx, node.SplitIndex());
7 changes: 3 additions & 4 deletions src/learner.cc
@@ -1212,11 +1212,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
6 changes: 3 additions & 3 deletions tests/cpp/gbm/test_gbtree.cc
@@ -430,19 +430,19 @@ TEST(GBTree, FeatureScore) {

   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));

   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
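The final loop pins down the relation between the averaged and the totalled importance types: `gain` and `cover` are reported as per-split averages while `weight` is the split count, so for every feature f (up to the kRtEps relative tolerance):

    total_gain[f] / gain[f] == weight[f]   # and likewise for cover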