[backport] [R] Fix global feature importance and predict with 1 sample. (dmlc#7394) (dmlc#7397)

* [R] Fix global feature importance.

* Add implementation for tree index. The parameter is not documented in the C API,
since we should work on porting model slicing to R instead of supporting more uses
of the tree index.

* Fix the difference between "gain" and "total_gain".

* debug.

* Fix prediction.
trivialfis committed Nov 5, 2021
1 parent a3d195e commit e7ac248
Showing 11 changed files with 119 additions and 49 deletions.
66 changes: 43 additions & 23 deletions R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results
 
+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))
 
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
 
+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
   }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
+  }
 
-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
-    }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
-    }
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
+    }
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
+    }
   }
   return(arr)
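The net effect of this hunk is that a single-sample input keeps its matrix shape instead of being `drop()`-ed to a vector. A minimal sketch of the fixed behaviour, not part of the commit, assuming the `agaricus` demo data bundled with the R package:

```r
## Sketch only: demonstrates the single-row shape fix, assuming the bundled
## agaricus demo data and the classic xgboost() training API.
library(xgboost)
data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  nrounds = 4, objective = "binary:logistic", verbose = 0
)
one_row <- agaricus.train$data[1, , drop = FALSE]
leaf <- predict(bst, one_row, predleaf = TRUE)
## With n_ret and n_row handled explicitly above, one sample yields a
## 1 x n_trees matrix rather than a bare vector.
stopifnot(is.matrix(leaf), nrow(leaf) == 1)
```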
6 changes: 3 additions & 3 deletions R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle, jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
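With this change, `xgb.importance` computes "Gain" and "Cover" from the total statistics and forwards the user's `trees` selection to the C API. A hedged usage sketch, reusing the `bst` model from the snippet above:

```r
## Sketch: slice importance to a subset of trees (0-based indices).
imp_all   <- xgb.importance(model = bst)
imp_first <- xgb.importance(model = bst, trees = 0)  # first tree only
## "Gain" is now a normalized total_gain, so the per-tree slice agrees with
## what xgb.model.dt.tree reports for the same trees (see the test below).
```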
31 changes: 30 additions & 1 deletion R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')
 
 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })
 
 test_that("xgb.importance works with GLM model", {
4 changes: 2 additions & 2 deletions doc/prediction.rst
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:
 
   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.
 
 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:
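In the R package the same contract follows from the code in this commit: `arr <- array(data = ret, dim = rev(shape))` with `shape[1] == n_row`, so the row axis comes last after R reverses the row-major shape reported by libxgboost. A small sketch, reusing `bst` and `agaricus.train` from above:

```r
## Sketch: with strict_shape, predict() returns the array exactly as
## libxgboost shapes it, reversed for R's column-major layout.
pm <- predict(bst, agaricus.train$data, outputmargin = TRUE, strict_shape = TRUE)
dim(pm)  # c(1, nrow(agaricus.train$data)) for this binary model
```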
7 changes: 4 additions & 3 deletions include/xgboost/gbm.h
@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
                     bool with_stats,
                     std::string format) const = 0;
 
-  virtual void FeatureScore(std::string const &importance_type,
-                            std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const = 0;
+  virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
+                            std::vector<bst_feature_t>* features,
+                            std::vector<float>* scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */
7 changes: 4 additions & 3 deletions include/xgboost/learner.h
@@ -155,9 +155,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Calculate feature score. See doc in C API for outputs.
    */
-  virtual void CalcFeatureScore(std::string const &importance_type,
-                                std::vector<bst_feature_t> *features,
-                                std::vector<float> *scores) = 0;
+  virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
+                                std::vector<bst_feature_t>* features,
+                                std::vector<float>* scores) = 0;
 
   /*
    * \brief Get number of boosted rounds from gradient booster.
12 changes: 10 additions & 2 deletions src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }
 
-  auto& scores = learner->GetThreadLocal().ret_vec_float;
+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
+  auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);
 
   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
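The handle now reads an optional `tree_idx` array from its JSON config. A sketch of the payload as the R package serializes it (the values here are made up):

```r
## Sketch of the JSON config consumed by XGBoosterFeatureScore; tree_idx may
## be null (all trees) or an array of 0-based tree indices.
args <- list(importance_type = "total_gain", feature_names = NULL, tree_idx = c(0L, 2L))
jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
## {"importance_type":"total_gain","feature_names":null,"tree_idx":[0,2]}
```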
2 changes: 2 additions & 0 deletions src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }
 
   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
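Since a linear booster has no trees, the new `CHECK` rejects any tree selection. Its semantics, restated in R purely for illustration (this helper is hypothetical, not the package's code path):

```r
## Hypothetical helper mirroring the CHECK above: an empty selection is the
## default; anything else is an error for gblinear.
check_trees_for_gblinear <- function(trees) {
  stopifnot(
    "gblinear doesn't support number of trees for feature importance." =
      length(trees) == 0
  )
  invisible(TRUE)
}
check_trees_for_gblinear(integer(0))  # OK
## check_trees_for_gblinear(0L)       # errors, like the CHECK
```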
20 changes: 15 additions & 5 deletions src/gbm/gbtree.h
@@ -300,18 +300,28 @@ class GBTree : public GradientBooster {
     }
   }
 
-  void FeatureScore(std::string const &importance_type,
-                    std::vector<bst_feature_t> *features,
-                    std::vector<float> *scores) const override {
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                    std::vector<bst_feature_t>* features,
+                    std::vector<float>* scores) const override {
     // Because feature with no importance doesn't appear in the return value so
     // we need to set up another pair of vectors to store the values during
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }
+
+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
-          auto const &node = (*p_tree)[nidx];
+          auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
             split_counts[node.SplitIndex()]++;
             fn(p_tree, nidx, node.SplitIndex());
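An empty `trees` span means "all trees": the `std::iota` block expands it to every tree index before scoring. The same default, mirrored in R for illustration (hypothetical helper, not the package's code):

```r
## Hypothetical helper mirroring the std::iota default: an empty selection
## expands to all 0-based tree indices.
expand_tree_selection <- function(trees, n_trees) {
  if (length(trees) == 0) seq_len(n_trees) - 1L else as.integer(trees)
}
expand_tree_selection(integer(0), 3)  # 0 1 2
expand_tree_selection(c(0L, 2L), 3)   # 0 2
```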
7 changes: 3 additions & 4 deletions src/learner.cc
@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }
 
-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }
 
   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
6 changes: 3 additions & 3 deletions tests/cpp/gbm/test_gbtree.cc
@@ -430,19 +430,19 @@ TEST(GBTree, FeatureScore) {
 
   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));
 
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);
 
     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);
 
     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
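The loop asserts that total importance divided by average importance recovers the split count. The identity, restated numerically in R with made-up values:

```r
## total_<type> == <type> * weight per feature, so total / average == count.
gain   <- c(1.50, 0.80)  # average gain per split, per feature (made up)
weight <- c(4, 2)        # number of splits per feature (made up)
total_gain <- gain * weight
stopifnot(all(abs(total_gain / gain - weight) < 1e-12))
```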
