From e7ac2486eb0b1ec3fad909643c1709d51495c91c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Sat, 6 Nov 2021 00:07:36 +0800
Subject: [PATCH] [backport] [R] Fix global feature importance and predict with 1 sample. (#7394) (#7397)

* [R] Fix global feature importance.

* Add implementation for tree index.

  The parameter is not documented in the C API, since we should work on
  porting the model slicing to R instead of supporting more uses of the
  tree index.

* Fix the difference between "gain" and "total_gain".

* debug.

* Fix prediction.
---
 R-package/R/xgb.Booster.R               | 66 ++++++++++++++++---------
 R-package/R/xgb.importance.R            |  6 +--
 R-package/tests/testthat/test_helpers.R | 31 +++++++++++-
 doc/prediction.rst                      |  4 +-
 include/xgboost/gbm.h                   |  7 +--
 include/xgboost/learner.h               |  7 +--
 src/c_api/c_api.cc                      | 12 ++++-
 src/gbm/gblinear.cc                     |  2 +
 src/gbm/gbtree.h                        | 20 ++++++--
 src/learner.cc                          |  7 ++-
 tests/cpp/gbm/test_gbtree.cc            |  6 +--
 11 files changed, 119 insertions(+), 49 deletions(-)

diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R
index 922af0eb0658..c19452925de3 100644
--- a/R-package/R/xgb.Booster.R
+++ b/R-package/R/xgb.Booster.R
@@ -397,6 +397,7 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   shape <- predts$shape
   ret <- predts$results
+  n_ret <- length(ret)
   n_row <- nrow(newdata)
   if (n_row != shape[1]) {
     stop("Incorrect predict shape.")
   }
@@ -405,36 +406,55 @@ predict.xgb.Booster <- function(object, newdata, missing = NA, outputmargin = FA
   arr <- array(data = ret, dim = rev(shape))
   cnames <- if (!is.null(colnames(newdata))) c(colnames(newdata), "BIAS") else NULL
+  n_groups <- shape[2]
+
+  ## Needed regardless of whether strict shape is being used.
   if (predcontrib) {
     dimnames(arr) <- list(cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
-    }
   } else if (predinteraction) {
     dimnames(arr) <- list(cnames, cnames, NULL, NULL)
-    if (!strict_shape) {
-      arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
-    }
+  }
+  if (strict_shape) {
+    return(arr) # strict shape is calculated by libxgboost uniformly.
   }

-  if (!strict_shape) {
-    n_groups <- shape[2]
-    if (predleaf) {
-      arr <- matrix(arr, nrow = n_row, byrow = TRUE)
-    } else if (predcontrib && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , ])
-    } else if (predinteraction && n_groups != 1) {
-      arr <- lapply(seq_len(n_groups), function(g) arr[g, , , ])
-    } else if (!reshape && n_groups != 1) {
-      arr <- ret
-    } else if (reshape && n_groups != 1) {
-      arr <- matrix(arr, ncol = n_groups, byrow = TRUE)
+  if (predleaf) {
+    ## Predict leaf
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1)
+    } else {
+      matrix(arr, nrow = n_row, byrow = TRUE)
     }
-    arr <- drop(arr)
-    if (length(dim(arr)) == 1) {
-      arr <- as.vector(arr)
-    } else if (length(dim(arr)) == 2) {
-      arr <- as.matrix(arr)
+  } else if (predcontrib) {
+    ## Predict contribution
+    arr <- aperm(a = arr, perm = c(2, 3, 1)) # [group, row, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , ])
+    } else {
+      ## remove the first axis (group)
+      as.matrix(arr[1, , ])
+    }
+  } else if (predinteraction) {
+    ## Predict interaction
+    arr <- aperm(a = arr, perm = c(3, 4, 1, 2)) # [group, row, col, col]
+    arr <- if (n_ret == n_row) {
+      matrix(arr, ncol = 1, dimnames = list(NULL, cnames))
+    } else if (n_groups != 1) {
+      ## turns array into list of matrices
+      lapply(seq_len(n_groups), function(g) arr[g, , , ])
+    } else {
+      ## remove the first axis (group)
+      arr[1, , , ]
+    }
+  } else {
+    ## Normal prediction
+    arr <- if (reshape && n_groups != 1) {
+      matrix(arr, ncol = n_groups, byrow = TRUE)
+    } else {
+      as.vector(ret)
     }
   }
   return(arr)
diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R
index 5176a9d54da0..67dbed0a5915 100644
--- a/R-package/R/xgb.importance.R
+++ b/R-package/R/xgb.importance.R
@@ -115,14 +115,14 @@ xgb.importance <- function(feature_names = NULL, model = NULL, trees = NULL,
   } else {
     concatenated <- list()
     output_names <- vector()
-    for (importance_type in c("weight", "gain", "cover")) {
-      args <- list(importance_type = importance_type, feature_names = feature_names)
+    for (importance_type in c("weight", "total_gain", "total_cover")) {
+      args <- list(importance_type = importance_type, feature_names = feature_names, tree_idx = trees)
       results <- .Call(
         XGBoosterFeatureScore_R, model$handle,
         jsonlite::toJSON(args, auto_unbox = TRUE, null = "null")
       )
       names(results) <- c("features", "shape", importance_type)
       concatenated[
-        switch(importance_type, "weight" = "Frequency", "gain" = "Gain", "cover" = "Cover")
+        switch(importance_type, "weight" = "Frequency", "total_gain" = "Gain", "total_cover" = "Cover")
       ] <- results[importance_type]
       output_names <- results$features
     }
diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R
index d52b706194ec..d66b72430626 100644
--- a/R-package/tests/testthat/test_helpers.R
+++ b/R-package/tests/testthat/test_helpers.R
@@ -1,3 +1,4 @@
+library(testthat)
 context('Test helper functions')

 require(xgboost)
@@ -310,7 +311,35 @@ test_that("xgb.importance works with and without feature names", {
   # for multiclass
   imp.Tree <- xgb.importance(model = mbst.Tree)
   expect_equal(dim(imp.Tree), c(4, 4))
-  xgb.importance(model = mbst.Tree, trees = seq(from = 0, by = nclass, length.out = nrounds))
+
+  trees <- seq(from = 0, by = 2, length.out = 2)
+  importance <- xgb.importance(feature_names = feature.names, model = bst.Tree, trees = trees)
+
+  importance_from_dump <- function() {
+    model_text_dump <- xgb.dump(model = bst.Tree, with_stats = TRUE, trees = trees)
+    imp <- xgb.model.dt.tree(
+      feature_names = feature.names,
+      text = model_text_dump,
+      trees = trees
+    )[
+      Feature != "Leaf", .(
+        Gain = sum(Quality),
+        Cover = sum(Cover),
+        Frequency = .N
+      ),
+      by = Feature
+    ][
+      , `:=`(
+        Gain = Gain / sum(Gain),
+        Cover = Cover / sum(Cover),
+        Frequency = Frequency / sum(Frequency)
+      )
+    ][
+      order(Gain, decreasing = TRUE)
+    ]
+    imp
+  }
+  expect_equal(importance_from_dump(), importance, tolerance = 1e-6)
 })

 test_that("xgb.importance works with GLM model", {
diff --git a/doc/prediction.rst b/doc/prediction.rst
index 60b6ad089197..8d2243d42545 100644
--- a/doc/prediction.rst
+++ b/doc/prediction.rst
@@ -32,8 +32,8 @@ After 1.4 release, we added a new parameter called ``strict_shape``, one can set
 - When using ``output_margin`` to avoid transformation and ``strict_shape`` is set to ``True``:

   Similar to the previous case, output is a 2-dim array, except for that ``multi:softmax``
-  has equivalent output of ``multi:softprob`` due to dropped transformation. If strict
-  shape is set to False then output can have 1 or 2 dim depending on used model.
+  has equivalent output shape of ``multi:softprob`` due to dropped transformation. If
+  strict shape is set to False then output can have 1 or 2 dim depending on used model.

 - When using ``preds_contribs`` with ``strict_shape`` set to ``True``:
diff --git a/include/xgboost/gbm.h b/include/xgboost/gbm.h
index 580cb52a59ea..b2808e377922 100644
--- a/include/xgboost/gbm.h
+++ b/include/xgboost/gbm.h
@@ -182,9 +182,10 @@ class GradientBooster : public Model, public Configurable {
                                              bool with_stats,
                                              std::string format) const = 0;
-  virtual void FeatureScore(std::string const &importance_type,
-                            std::vector<bst_feature_t> *features,
-                            std::vector<float> *scores) const = 0;
+  virtual void FeatureScore(std::string const& importance_type,
+                            common::Span<int32_t const> trees,
+                            std::vector<bst_feature_t>* features,
+                            std::vector<float>* scores) const = 0;
   /*!
    * \brief Whether the current booster uses GPU.
    */
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
index 09c16eff6cfa..cbbe91c00fb2 100644
--- a/include/xgboost/learner.h
+++ b/include/xgboost/learner.h
@@ -155,9 +155,10 @@ class Learner : public Model, public Configurable, public dmlc::Serializable {
   /*!
    * \brief Calculate feature score. See doc in C API for outputs.
    */
-  virtual void CalcFeatureScore(std::string const &importance_type,
-                                std::vector<bst_feature_t> *features,
-                                std::vector<float> *scores) = 0;
+  virtual void CalcFeatureScore(std::string const& importance_type,
+                                common::Span<int32_t const> trees,
+                                std::vector<bst_feature_t>* features,
+                                std::vector<float>* scores) = 0;

   /*
    * \brief Get number of boosted rounds from gradient booster.
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index e9cb6177b0da..42b9cad6fb4a 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1159,9 +1159,17 @@ XGB_DLL int XGBoosterFeatureScore(BoosterHandle handle, char const *json_config,
     custom_feature_names = get<Array const>(config["feature_names"]);
   }

-  auto& scores = learner->GetThreadLocal().ret_vec_float;
+  std::vector<int32_t> tree_idx;
+  if (!IsA<Null>(config["tree_idx"])) {
+    auto j_tree_idx = get<Array const>(config["tree_idx"]);
+    for (auto const &idx : j_tree_idx) {
+      tree_idx.push_back(get<Integer const>(idx));
+    }
+  }
+
+  auto &scores = learner->GetThreadLocal().ret_vec_float;
   std::vector<bst_feature_t> features;
-  learner->CalcFeatureScore(importance, &features, &scores);
+  learner->CalcFeatureScore(importance, common::Span<int32_t const>(tree_idx), &features, &scores);

   auto n_features = learner->GetNumFeature();
   GenerateFeatureMap(learner, custom_feature_names, n_features, &feature_map);
diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc
index 8da1f67f40e2..e6ccd303de6f 100644
--- a/src/gbm/gblinear.cc
+++ b/src/gbm/gblinear.cc
@@ -232,9 +232,11 @@ class GBLinear : public GradientBooster {
   }

   void FeatureScore(std::string const &importance_type,
+                    common::Span<int32_t const> trees,
                     std::vector<bst_feature_t> *out_features,
                     std::vector<float> *out_scores) const override {
     CHECK(!model_.weight.empty()) << "Model is not initialized";
+    CHECK(trees.empty()) << "gblinear doesn't support number of trees for feature importance.";
     CHECK_EQ(importance_type, "weight")
         << "gblinear only has `weight` defined for feature importance.";
     out_features->resize(this->learner_model_param_->num_feature, 0);
diff --git a/src/gbm/gbtree.h b/src/gbm/gbtree.h
index 958ce00f8463..a0b15603ea67 100644
--- a/src/gbm/gbtree.h
+++ b/src/gbm/gbtree.h
@@ -300,18 +300,28 @@ class GBTree : public GradientBooster {
     }
   }

-  void FeatureScore(std::string const &importance_type,
-                    std::vector<bst_feature_t> *features,
-                    std::vector<float> *scores) const override {
+  void FeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                    std::vector<bst_feature_t>* features,
+                    std::vector<float>* scores) const override {
     // Because feature with no importance doesn't appear in the return value so
     // we need to set up another pair of vectors to store the values during
     // computation.
     std::vector<size_t> split_counts(this->model_.learner_model_param->num_feature, 0);
     std::vector<float> gain_map(this->model_.learner_model_param->num_feature, 0);
+    std::vector<int32_t> tree_idx;
+    if (trees.empty()) {
+      tree_idx.resize(this->model_.trees.size());
+      std::iota(tree_idx.begin(), tree_idx.end(), 0);
+      trees = common::Span<int32_t const>(tree_idx);
+    }
+
+    auto total_n_trees = model_.trees.size();
     auto add_score = [&](auto fn) {
-      for (auto const &p_tree : model_.trees) {
+      for (auto idx : trees) {
+        CHECK_LE(idx, total_n_trees) << "Invalid tree index.";
+        auto const& p_tree = model_.trees[idx];
         p_tree->WalkTree([&](bst_node_t nidx) {
-          auto const &node = (*p_tree)[nidx];
+          auto const& node = (*p_tree)[nidx];
           if (!node.IsLeaf()) {
             split_counts[node.SplitIndex()]++;
             fn(p_tree, nidx, node.SplitIndex());
diff --git a/src/learner.cc b/src/learner.cc
index 8fd19091d0be..e5244bc25ec2 100644
--- a/src/learner.cc
+++ b/src/learner.cc
@@ -1214,11 +1214,10 @@ class LearnerImpl : public LearnerIO {
     *out_preds = &out_predictions.predictions;
   }

-  void CalcFeatureScore(std::string const &importance_type,
-                        std::vector<bst_feature_t> *features,
-                        std::vector<float> *scores) override {
+  void CalcFeatureScore(std::string const& importance_type, common::Span<int32_t const> trees,
+                        std::vector<bst_feature_t>* features, std::vector<float>* scores) override {
     this->Configure();
-    gbm_->FeatureScore(importance_type, features, scores);
+    gbm_->FeatureScore(importance_type, trees, features, scores);
   }

   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
diff --git a/tests/cpp/gbm/test_gbtree.cc b/tests/cpp/gbm/test_gbtree.cc
index 9255bf2c32dc..3c307594b8de 100644
--- a/tests/cpp/gbm/test_gbtree.cc
+++ b/tests/cpp/gbm/test_gbtree.cc
@@ -430,7 +430,7 @@ TEST(GBTree, FeatureScore) {
   std::vector<bst_feature_t> features_weight;
   std::vector<float> scores_weight;
-  learner->CalcFeatureScore("weight", &features_weight, &scores_weight);
+  learner->CalcFeatureScore("weight", {}, &features_weight, &scores_weight);
   ASSERT_EQ(features_weight.size(), scores_weight.size());
   ASSERT_LE(features_weight.size(), learner->GetNumFeature());
   ASSERT_TRUE(std::is_sorted(features_weight.begin(), features_weight.end()));

@@ -438,11 +438,11 @@ TEST(GBTree, FeatureScore) {
   auto test_eq = [&learner, &scores_weight](std::string type) {
     std::vector<bst_feature_t> features;
     std::vector<float> scores;
-    learner->CalcFeatureScore(type, &features, &scores);
+    learner->CalcFeatureScore(type, {}, &features, &scores);

     std::vector<bst_feature_t> features_total;
     std::vector<float> scores_total;
-    learner->CalcFeatureScore("total_" + type, &features_total, &scores_total);
+    learner->CalcFeatureScore("total_" + type, {}, &features_total, &scores_total);

     for (size_t i = 0; i < scores_weight.size(); ++i) {
       ASSERT_LE(RelError(scores_total[i] / scores[i], scores_weight[i]), kRtEps);
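
A minimal R sketch of the behaviour this backport targets, assuming an XGBoost build that includes the patch and the bundled agaricus.train demo data; the object names (bst, one_row) are illustrative only:

library(xgboost)

data(agaricus.train, package = "xgboost")
bst <- xgboost(
  data = agaricus.train$data, label = agaricus.train$label,
  nrounds = 4, objective = "binary:logistic", verbose = 0
)

## Predicting a single row: the result keeps its documented shape instead of
## being collapsed by drop() into a bare vector.
one_row <- agaricus.train$data[1, , drop = FALSE]
predict(bst, one_row)                   # length-1 probability vector
predict(bst, one_row, predleaf = TRUE)  # 1 x nrounds matrix of leaf indices

## Global importance: Gain and Cover are now aggregated from total_gain and
## total_cover, and the computation can be restricted to a subset of trees
## (0-based indices, matching xgb.dump()).
xgb.importance(model = bst)
xgb.importance(model = bst, trees = c(0, 1))

With this change the table returned by xgb.importance() agrees with summing Quality and Cover over the same trees in xgb.model.dt.tree(), which is exactly what the new assertion in test_helpers.R verifies.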