From e7ac2486eb0b1ec3fad909643c1709d51495c91c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 6 Nov 2021 00:07:36 +0800
Subject: [PATCH] [backport] [R] Fix global feature importance and predict with 1 sample. (#7394) (#7397)

* [R] Fix global feature importance.

* Add implementation for tree index.

  The parameter is not documented in the C API, since we should work on
  porting the model slicing to R instead of supporting more uses of the
  tree index.

* Fix the difference between "gain" and "total_gain".

* debug.

* Fix prediction.
---
 R-package/R/xgb.Booster.R               | 66 ++++++++++++++++---------
 R-package/R/xgb.importance.R            |  6 +--
 R-package/tests/testthat/test_helpers.R | 31 +++++++++++-
 doc/prediction.rst                      |  4 +-
 include/xgboost/gbm.h                   |  7 +--
 include/xgboost/learner.h               |  7 +--
 src/c_api/c_api.cc                      | 12 ++++-
 src/gbm/gblinear.cc                     |  2 +
 src/gbm/gbtree.h                        | 20 ++++++--
 src/learner.cc                          |  7 ++-
 tests/cpp/gbm/test_gbtree.cc            |  6 +--
 11 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 922af0eb0658..c19452925de3 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results
+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
   }
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
+
+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
+  }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
   }

-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
     }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
     }
   }
   return(arr)
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 5176a9d54da0..67dbed0a5915 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle,
         jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index d52b706194ec..d66b72430626 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {
diff --git a/doc/prediction.rst b/doc/prediction.rst
index 60b6ad089197..8d2243d42545 100644
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.

 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 580cb52a59ea..b2808e377922 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
                                              bool with_stats,
                                              std::string format) const = 0;
-  virtual void FeatureScore(std::string const &importance_type,
-                            std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const = 0;
+  virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
+                            std::vector<bst_feature_t>* features,
+                            std::vector<float>* scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 09c16eff6cfa..cbbe91c00fb2 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -155,9 +155,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Calculate feature score. See doc in C API for outputs.
    */
-  virtual void CalcFeatureScore(std::string const &importance_type,
-                                std::vector<bst_feature_t> *features,
-                                std::vector<float> *scores) = 0;
+  virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
+                                std::vector<bst_feature_t>* features,
+                                std::vector<float>* scores) = 0;

   /*
    * \brief Get number of boosted rounds from gradient booster.
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index e9cb6177b0da..42b9cad6fb4a 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }

-  auto& scores = learner->GetThreadLocal().ret_vec_float;
+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
+  auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 8da1f67f40e2..e6ccd303de6f 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 958ce00f8463..a0b15603ea67 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -300,18 +300,28 @@ class GBTree : public GradientBooster {
     }
   }

-  void FeatureScore(std::string const &importance_type,
-                    std::vector<bst_feature_t> *features,
-                    std::vector<float> *scores) const override {
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                    std::vector<bst_feature_t>* features,
+                    std::vector<float>* scores) const override {
     // Because feature with no importance doesn't appear in the return value so
     // we need to set up another pair of vectors to store the values during
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }
+
+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
-          auto const &node = (*p_tree)[nidx];
+          auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
             split_counts[node.SplitIndex()]++;
             fn(p_tree, nidx, node.SplitIndex());
diff --git a/src/learner.cc b/src/learner.cc
index 8fd19091d0be..e5244bc25ec2 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index 9255bf2c32dc..3c307594b8de 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {
   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));

@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
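
A minimal R sketch of the behaviour this backport targets, assuming an XGBoost build that includes the patch and the bundled agaricus.train demo data; the object names (bst, one_row) are illustrative only:

library(xgboost)

data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  nrounds = 4, objective = "binary:logistic", verbose = 0
)

## Predicting a single row: the result keeps its documented shape instead of
## being collapsed by drop() into a bare vector.
one_row <- agaricus.train$data[1, , drop = FALSE]
predict(bst, one_row)                   # length-1 probability vector
predict(bst, one_row, predleaf = TRUE)  # 1 x nrounds matrix of leaf indices

## Global importance: Gain and Cover are now aggregated from total_gain and
## total_cover, and the computation can be restricted to a subset of trees
## (0-based indices, matching xgb.dump()).
xgb.importance(model = bst)
xgb.importance(model = bst, trees = c(0, 1))

With this change the table returned by xgb.importance() agrees with summing Quality and Cover over the same trees in xgb.model.dt.tree(), which is exactly what the new assertion in test_helpers.R verifies.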