[R] Fix global feature importance and predict with 1 sample. #7394

Merged 3 commits on Nov 5, 2021
66 changes: 43 additions & 23 deletions R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
shape <- predts$shape
ret <- predts$results

n_ret <- length(ret)
n_row <- nrow(newdata)
if (n_row != shape[1]) {
stop("Incorrect predict shape.")
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
arr <- array(data = ret, dim = rev(shape))

cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
n_groups <- shape[2]

## Needed regardless of whether strict shape is being used.
if (predcontrib) {
dimnames(arr) <- list(cnames, NULL, NULL)
if (!strict_shape) {
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
}
} else if (predinteraction) {
dimnames(arr) <- list(cnames, cnames, NULL, NULL)
if (!strict_shape) {
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
}
}
if (strict_shape) {
return(arr) # strict shape is calculated by libxgboost uniformly.
}

if (!strict_shape) {
n_groups <- shape[2]
if (predleaf) {
arr <- matrix(arr, nrow = n_row, byrow = TRUE)
} else if (predcontrib && n_groups != 1) {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
} else if (predinteraction && n_groups != 1) {
arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else if (!reshape && n_groups != 1) {
arr <- ret
} else if (reshape && n_groups != 1) {
arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
if (predleaf) {
## Predict leaf
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1)
} else {
matrix(arr, nrow = n_row, byrow = TRUE)
}
arr <- drop(arr)
if (length(dim(arr)) == 1) {
arr <- as.vector(arr)
} else if (length(dim(arr)) == 2) {
arr <- as.matrix(arr)
} else if (predcontrib) {
## Predict contribution
arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , ])
} else {
## remove the first axis (group)
as.matrix(arr[1, , ])
}
} else if (predinteraction) {
## Predict interaction
arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
arr <- if (n_ret == n_row) {
matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
} else if (n_groups != 1) {
## turns array into list of matrices
lapply(seq_len(n_groups), function(g) arr[g, , , ])
} else {
## remove the first axis (group)
arr[1, , , ]
}
} else {
## Normal prediction
arr <- if (reshape && n_groups != 1) {
matrix(arr, ncol = n_groups, byrow = TRUE)
} else {
as.vector(ret)
}
}
return(arr)
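The hunk above makes `predict.xgb.Booster` handle the single-sample case explicitly (the new `n_ret == n_row` branches) instead of relying on the generic reshape logic. A minimal sketch of the calls that exercise this path; `bst` and `x` are hypothetical objects, not taken from the diff:

```r
library(xgboost)

## Hypothetical toy model: any numeric matrix with a binary label will do.
x <- as.matrix(mtcars[, colnames(mtcars) != "am"])
bst <- xgboost(data = x, label = mtcars$am, nrounds = 4,
               objective = "binary:logistic", verbose = 0)

## Predicting with exactly one sample is the case the patch addresses.
one_row <- x[1, , drop = FALSE]
pred    <- predict(bst, one_row)                      # regular prediction
contrib <- predict(bst, one_row, predcontrib = TRUE)  # per-feature contributions + BIAS
leaves  <- predict(bst, one_row, predleaf = TRUE)     # leaf index per tree
str(contrib); str(leaves)
```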
6 changes: 3 additions & 3 deletions R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
} else {
concatenated <- list()
output_names <- vector()
for (importance_type in c("weight", "gain", "cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names)
for (importance_type in c("weight", "total_gain", "total_cover")) {
args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
results <- .Call(
XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
)
names(results) <- c("features", "shape", importance_type)
concatenated[
switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
] <- results[importance_type]
output_names <- results$features
}
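With this change the tree-based branch of `xgb.importance` asks the native side for `total_gain`/`total_cover` (reported as `Gain`/`Cover`) and forwards the `trees` argument as `tree_idx`. A hedged usage sketch, reusing the hypothetical `bst` from the previous example:

```r
## Importance over all trees (default) vs. a subset; indices are 0-based.
imp_all    <- xgb.importance(model = bst)
imp_subset <- xgb.importance(model = bst, trees = c(0L, 2L))
imp_subset  # Gain / Cover / Frequency computed from trees 0 and 2 only
```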
31 changes: 30 additions & 1 deletion R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
library(testthat)
context('Test helper functions')

require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
# for multiclass
imp.Tree <- xgb.importance(model = mbst.Tree)
expect_equal(dim(imp.Tree), c(4, 4))
xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))

trees <- seq(from = 0, by = 2, length.out = 2)
importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)

importance_from_dump <- function() {
model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
imp <- xgb.model.dt.tree(
feature_names = feature.names,
text = model_text_dump,
trees = trees
)[
Feature != "Leaf", .(
Gain = sum(Quality),
Cover = sum(Cover),
Frequency = .N
),
by = Feature
][
, `:=`(
Gain = Gain / sum(Gain),
Cover = Cover / sum(Cover),
Frequency = Frequency / sum(Frequency)
)
][
order(Gain, decreasing = TRUE)
]
imp
}
expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
})

test_that("xgb.importance works with GLM model", {
4 changes: 2 additions & 2 deletions doc/prediction.rst
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
- When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
shape is set to False then output can have 1 or 2 dim depending on used model.
has an output shape equivalent to that of ``multi:softprob`` due to the dropped transformation.
If strict shape is set to False, then the output can have 1 or 2 dims depending on the model used.

- When using ``preds_contribs`` with ``strict_shape`` set to ``True``:

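The documentation tweak only adjusts wording; the behaviour it describes is also visible from R. A sketch (same hypothetical `bst`/`x` as above) of how `strict_shape` changes the returned dimensionality:

```r
## strict_shape = TRUE always returns the array shape computed by libxgboost;
## strict_shape = FALSE reshapes depending on the model and prediction type.
p_default <- predict(bst, x, predcontrib = TRUE)
p_strict  <- predict(bst, x, predcontrib = TRUE, strict_shape = TRUE)
dim(p_default)  # for this binary sketch: rows x (features + BIAS)
dim(p_strict)   # fixed-rank array, as described in the section above
```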
7 changes: 4 additions & 3 deletions include/xgboost/gbm.h
@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
bool with_stats,
std::string format) const = 0;

virtual void FeatureScore(std::string const &importance_type,
std::vector<bst_feature_t> *features,
std::vector<float> *scores) const = 0;
virtual void FeatureScore(std::string const& importance_type,
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const = 0;
/*!
* \brief Whether the current booster uses GPU.
*/
7 changes: 4 additions & 3 deletions include/xgboost/learner.h
@@ -156,9 +156,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
/*!
* \brief Calculate feature score. See doc in C API for outputs.
*/
virtual void CalcFeatureScore(std::string const &importance_type,
std::vector<bst_feature_t> *features,
std::vector<float> *scores) = 0;
virtual void CalcFeatureScore(std::string const& importance_type,
common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) = 0;

/*
* \brief Get number of boosted rounds from gradient booster.
12 changes: 10 additions & 2 deletions src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
custom_feature_names = get<Array const>(config["feature_names"]);
}

auto& scores = learner->GetThreadLocal().ret_vec_float;
std::vector<int32_t> tree_idx;
if (!IsA<Null>(config["tree_idx"])) {
auto j_tree_idx = get<Array const>(config["tree_idx"]);
for (auto const &idx : j_tree_idx) {
tree_idx.push_back(get<Integer const>(idx));
}
}

auto &scores = learner->GetThreadLocal().ret_vec_float;
std::vector<bst_feature_t> features;
learner->CalcFeatureScore(importance, &features, &scores);
learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

auto n_features = learner->GetNumFeature();
GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
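On the C API side, `XGBoosterFeatureScore` now reads an optional `tree_idx` array from its JSON config. A sketch of the payload the R wrapper builds for it (field names taken from the diff above; the values are hypothetical):

```r
library(jsonlite)
args <- list(importance_type = "total_gain",
             feature_names = NULL,    # or a character vector of names
             tree_idx = c(0L, 2L))    # 0-based tree indices; NULL means all trees
toJSON(args, auto_unbox = TRUE, null = "null")
## roughly: {"importance_type":"total_gain","feature_names":null,"tree_idx":[0,2]}
```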
2 changes: 2 additions & 0 deletions src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
}

void FeatureScore(std::string const &importance_type,
common::Span<int32_t const> trees,
std::vector<bst_feature_t> *out_features,
std::vector<float> *out_scores) const override {
CHECK(!model_.weight.empty()) << "Model is not initialized";
CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
CHECK_EQ(importance_type, "weight")
<< "gblinear only has `weight` defined for feature importance.";
out_features->resize(this->learner_model_param_->num_feature, 0);
20 changes: 15 additions & 5 deletions src/gbm/gbtree.h
@@ -300,18 +300,28 @@
}
}

void FeatureScore(std::string const &importance_type,
std::vector<bst_feature_t> *features,
std::vector<float> *scores) const override {
void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features,
std::vector<float>* scores) const override {
// Because feature with no importance doesn't appear in the return value so
// we need to set up another pair of vectors to store the values during
// computation.
std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
std::vector<int32_t> tree_idx;
if (trees.empty()) {
tree_idx.resize(this->model_.trees.size());
std::iota(tree_idx.begin(), tree_idx.end(), 0);
trees = common::Span<int32_t const>(tree_idx);
}

auto total_n_trees = model_.trees.size();
auto add_score = [&](auto fn) {
for (auto const &p_tree : model_.trees) {
for (auto idx : trees) {
CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
auto const& p_tree = model_.trees[idx];
p_tree->WalkTree([&](bst_node_t nidx) {
auto const &node = (*p_tree)[nidx];
auto const& node = (*p_tree)[nidx];
if (!node.IsLeaf()) {
split_counts[node.SplitIndex()]++;
fn(p_tree, nidx, node.SplitIndex());
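When the `trees` span is empty, `FeatureScore` falls back to every tree in the model (the `std::iota` fill above), so explicitly listing all indices should reproduce the default. A hedged check from the R side, with the hypothetical `bst` as before:

```r
## One tree per boosting round for this binary objective; indices are 0-based.
all_idx <- seq_len(bst$niter) - 1L
all.equal(xgb.importance(model = bst),
          xgb.importance(model = bst, trees = all_idx))  # expected: TRUE
```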
7 changes: 3 additions & 4 deletions src/learner.cc
@@ -1212,11 +1212,10 @@
*out_preds = &out_predictions.predictions;
}

void CalcFeatureScore(std::string const &importance_type,
std::vector<bst_feature_t> *features,
std::vector<float> *scores) override {
void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
this->Configure();
gbm_->FeatureScore(importance_type, features, scores);
gbm_->FeatureScore(importance_type, trees, features, scores);
}

const std::map<std::string, std::string>& GetConfigurationArguments() const override {
6 changes: 3 additions & 3 deletions tests/cpp/gbm/test_gbtree.cc
@@ -430,19 +430,19 @@ TEST(GBTree, FeatureScore) {

std::vector<bst_feature_t> features_weight;
std::vector<float> scores_weight;
learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
ASSERT_EQ(features_weight.size(), scores_weight.size());
ASSERT_LE(features_weight.size(), learner->GetNumFeature());
ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));

auto test_eq = [&learner, &scores_weight](std::string type) {
std::vector<bst_feature_t> features;
std::vector<float> scores;
learner->CalcFeatureScore(type, &features, &scores);
learner->CalcFeatureScore(type, {}, &features, &scores);

std::vector<bst_feature_t> features_total;
std::vector<float> scores_total;
learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

for (size_t i = 0; i < scores_weight.size(); ++i) {
ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
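The C++ test verifies, per feature, that `total_<type> / <type>` matches the split count reported by `weight`. The same relationship can be read off a tree dump from R; a rough, hypothetical analogue using the `bst` from the first sketch:

```r
library(data.table)
dt <- xgb.model.dt.tree(model = bst)
## Per feature: split count, total gain, and mean gain.
## sum(Quality) / mean(Quality) equals the split count, mirroring the test.
dt[Feature != "Leaf",
   .(weight = .N, total_gain = sum(Quality), mean_gain = mean(Quality)),
   by = Feature]
```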