
Commit 67a664d
Merge branch 'master' of github.com:dmlc/xgboost into remove-single
RAMitchell committed Apr 29, 2022
2 parents 0486996 + f7db16a commit 67a664d
Showing 106 changed files with 2,137 additions and 1,086 deletions.
1 change: 1 addition & 0 deletions amalgamation/xgboost-all0.cc
@@ -24,6 +24,7 @@
#include "../src/objective/rank_obj.cc"
#include "../src/objective/hinge.cc"
#include "../src/objective/aft_obj.cc"
#include "../src/objective/adaptive.cc"

// gbms
#include "../src/gbm/gbm.cc"
1 change: 0 additions & 1 deletion demo/guide-python/cat_in_the_dat.py
@@ -63,7 +63,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:

params = {
"tree_method": "gpu_hist",
"use_label_encoder": False,
"n_estimators": 32,
"colsample_bylevel": 0.7,
}
14 changes: 6 additions & 8 deletions demo/guide-python/continuation.py
@@ -14,13 +14,13 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
"""Basic training continuation."""
# Train 128 iterations in 1 session
X, y = load_breast_cancer(return_X_y=True)
clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=128)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())

# Train 128 iterations in 2 sessions, with the first session running for 32 iterations
# and the second running for 96 iterations
clf = xgboost.XGBClassifier(n_estimators=32, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=32)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
assert clf.get_booster().num_boosted_rounds() == 32

@@ -54,14 +54,14 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
n_estimators = 512

X, y = load_breast_cancer(return_X_y=True)
clf = xgboost.XGBClassifier(n_estimators=n_estimators, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=n_estimators)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
best = clf.best_iteration

# Train 512 iterations in 2 sessions, with the first session running for 128 iterations
# and the second running until early stopping.
clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=128)
# Reinitialize the early stop callback
early_stop = xgboost.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
@@ -79,15 +79,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
else:
path = os.path.join(tmpdir, "model-first-128.json")
clf.save_model(path)
loaded = xgboost.XGBClassifier(use_label_encoder=False)
loaded = xgboost.XGBClassifier()
loaded.load_model(path)

early_stop = xgboost.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
)
clf = xgboost.XGBClassifier(
n_estimators=n_estimators - 128, use_label_encoder=False
)
clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128)
clf.fit(
X,
y,
2 changes: 1 addition & 1 deletion demo/guide-python/predict_first_ntree.py
@@ -35,7 +35,7 @@ def native_interface():
def sklearn_interface():
X_train, y_train = load_svmlight_file(train)
X_test, y_test = load_svmlight_file(test)
clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False)
clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
assert clf.n_classes_ == 2

4 changes: 2 additions & 2 deletions doc/jvm/xgboost4j_spark_gpu_tutorial.rst
@@ -1,5 +1,5 @@
#############################################
XGBoost4J-Spark-GPU Tutorial (version 1.6.0+)
XGBoost4J-Spark-GPU Tutorial (version 1.6.1+)
#############################################

**XGBoost4J-Spark-GPU** is an open source library aiming to accelerate distributed XGBoost training on Apache Spark cluster from
@@ -220,7 +220,7 @@ application jar is iris-1.0.0.jar
cudf_version=22.02.0
rapids_version=22.02.0
xgboost_version=1.6.0
xgboost_version=1.6.1
main_class=Iris
app_jar=iris-1.0.0.jar
8 changes: 1 addition & 7 deletions doc/jvm/xgboost4j_spark_tutorial.rst
@@ -16,12 +16,6 @@ This tutorial is to cover the end-to-end process to build a machine learning pip
* Building a Machine Learning Pipeline with XGBoost4J-Spark
* Running XGBoost4J-Spark in Production

.. note::

**SparkContext will be stopped by default when XGBoost training task fails**.

XGBoost4J-Spark 1.2.0+ exposes a parameter **kill_spark_context_on_worker_failure**. Set **kill_spark_context_on_worker_failure** to **false** so that the SparkContext will not be stopping on training failure. Instead of stopping the SparkContext, XGBoost4J-Spark will throw an exception instead. Users who want to re-use the SparkContext should wrap the training code in a try-catch block.

.. contents::
:backlinks: none
:local:
@@ -129,7 +123,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri

.. note::

There is no need to assemble feature columns from version 1.6.0+. Instead, users can specify an array of
There is no need to assemble feature columns from version 1.6.1+. Instead, users can specify an array of
feature column names by ``setFeaturesCol(value: Array[String])`` and XGBoost4j-Spark will do it.

Dealing with missing values
9 changes: 8 additions & 1 deletion doc/model.schema
@@ -400,7 +400,6 @@
"reg_loss_param"
]
},

{
"type": "object",
"properties": {
@@ -433,6 +432,14 @@
"tweedie_regression_param"
]
},
{
"properties": {
"name": {
"const": "reg:absoluteerror"
}
},
"type": "object"
},
{
"type": "object",
"properties": {
1 change: 1 addition & 0 deletions doc/parameter.rst
@@ -349,6 +349,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issue with this objective.
- ``reg:logistic``: logistic regression.
- ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
- ``reg:absoluteerror``: regression with L1 error. When a tree model is used, the leaf values are refreshed after tree construction (see the usage sketch below).
- ``binary:logistic``: logistic regression for binary classification, output probability
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
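As a usage note for the ``reg:absoluteerror`` entry above: the new objective is selected like any other objective string. Below is a minimal sketch against XGBoost's public C API, with a toy matrix, made-up labels, and an arbitrary round count chosen purely for illustration (error handling omitted):

```cpp
#include <xgboost/c_api.h>

#include <limits>
#include <vector>

int main() {
  // Toy data: 4 rows x 2 features, plus labels (illustrative values only).
  std::vector<float> data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  std::vector<float> labels{1.5f, 3.5f, 5.5f, 7.5f};

  DMatrixHandle dtrain;
  XGDMatrixCreateFromMat(data.data(), 4, 2,
                         /*missing=*/std::numeric_limits<float>::quiet_NaN(), &dtrain);
  XGDMatrixSetFloatInfo(dtrain, "label", labels.data(), 4);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);
  // Select the new L1 objective; with a tree method, leaf values are refreshed
  // after each tree is built.
  XGBoosterSetParam(booster, "objective", "reg:absoluteerror");
  XGBoosterSetParam(booster, "tree_method", "hist");

  for (int iter = 0; iter < 8; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```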
2 changes: 1 addition & 1 deletion doc/treemethod.rst
@@ -10,7 +10,7 @@ are also some free standing updaters including ``grow_local_histmaker``, ``refre
as the latter is just a pre-configuration of the former. The difference is mostly due to
historical reasons that each updater requires some specific configurations and might have
missing features. As we are moving forward, the gap between them is becoming more and
more irrevelant. We will collectively document them under tree methods.
more irrelevant. We will collectively document them under tree methods.

**************
Exact Solution
4 changes: 1 addition & 3 deletions doc/tutorials/categorical.rst
@@ -36,9 +36,7 @@ parameter ``enable_categorical``:
.. code:: python
# Supported tree methods are `gpu_hist`, `approx`, and `hist`.
clf = xgb.XGBClassifier(
tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
)
clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
# X is the dataframe we created in previous snippet
clf.fit(X, y)
# Must use JSON/UBJSON for serialization, otherwise the information is lost.
5 changes: 2 additions & 3 deletions include/xgboost/gbm.h
@@ -90,9 +90,8 @@ class GradientBooster : public Model, public Configurable {
* \param prediction The output prediction cache entry that needs to be updated.
* the booster may change content of gpair
*/
virtual void DoBoost(DMatrix* p_fmat,
HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry*) = 0;
virtual void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry*, ObjFunction const* obj) = 0;

/*!
* \brief generate predictions for given feature matrix
8 changes: 6 additions & 2 deletions include/xgboost/linalg.h
@@ -670,9 +670,13 @@ class Tensor {
* See \ref TensorView for parameters of this constructor.
*/
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], int32_t device) {
explicit Tensor(I const (&shape)[D], int32_t device)
: Tensor{common::Span<I const, D>{shape}, device} {}

template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, int32_t device) {
// No device unroll as this is a host only function.
std::copy(shape, shape + D, shape_);
std::copy(shape.data(), shape.data() + D, shape_);
for (auto i = D; i < kDim; ++i) {
shape_[i] = 1;
}
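For reference, the array-shape constructor above now simply forwards the shape as a Span to the new Span-based overload. A small sketch of constructing a host tensor this way (assuming ``-1`` denotes the CPU device, as elsewhere in the codebase):

```cpp
#include <xgboost/linalg.h>

#include <cstddef>

int main() {
  // A 3x4 float tensor kept on the host; the array-ref constructor delegates to
  // the new Span-based constructor internally.
  std::size_t shape[2]{3, 4};
  xgboost::linalg::Tensor<float, 2> dense{shape, /*device=*/-1};
  return dense.Size() == 12 ? 0 : 1;
}
```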
20 changes: 19 additions & 1 deletion include/xgboost/objective.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
* Copyright 2014-2022 by Contributors
* \file objective.h
* \brief interface of objective function used by xgboost.
* \author Tianqi Chen, Kailong Chen
@@ -22,6 +22,8 @@

namespace xgboost {

class RegTree;

/*! \brief interface of objective function */
class ObjFunction : public Configurable {
protected:
@@ -88,6 +90,22 @@ class ObjFunction : public Configurable {
return 1;
}

/**
* \brief Update the leaf values after a tree is built. Needed for objectives with 0
* hessian.
*
* Note that the leaf update is not well defined for distributed training as XGBoost
* computes only an average of the quantiles across workers. This breaks when some leaves
* have no samples assigned in a local worker.
*
* \param position The leaf index for each row.
* \param info MetaInfo providing labels and weights.
* \param prediction Model prediction after transformation.
* \param p_tree Tree that needs to be updated.
*/
virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
HostDeviceVector<float> const& prediction, RegTree* p_tree) const {}

/*!
* \brief Create an objective function according to name.
* \param tparam Generic parameters.
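To make the intent of the new hook concrete: conceptually, the leaf refresh groups rows by the leaf they landed in (skipping rows whose position was negated by sampling) and recomputes each leaf value from the residuals, for example as their median. The following self-contained sketch illustrates that idea only; the actual logic lives in src/objective/adaptive.cc and differs in detail:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Conceptual leaf refresh for a zero-hessian objective such as L1 error.
// Returns the new value for each leaf index; an UpdateTreeLeaf override would
// write these values back into the RegTree.
std::unordered_map<std::int32_t, float> RefreshLeaves(
    std::vector<std::int32_t> const& position,  // leaf index per row, negated if sampled out
    std::vector<float> const& label,
    std::vector<float> const& predt) {          // prediction after transformation
  std::unordered_map<std::int32_t, std::vector<float>> residuals;
  for (std::size_t i = 0; i < position.size(); ++i) {
    if (position[i] < 0) {
      continue;  // row was removed during sampling
    }
    residuals[position[i]].push_back(label[i] - predt[i]);
  }

  std::unordered_map<std::int32_t, float> leaf_value;
  for (auto& kv : residuals) {
    auto& r = kv.second;
    // Median of the residuals assigned to this leaf.
    std::nth_element(r.begin(), r.begin() + r.size() / 2, r.end());
    leaf_value[kv.first] = r[r.size() / 2];
  }
  return leaf_value;
}
```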
9 changes: 7 additions & 2 deletions include/xgboost/task.h
@@ -33,13 +33,18 @@ struct ObjInfo {
} task;
// Does the objective have constant hessian value?
bool const_hess{false};
bool zero_hess{false};

explicit ObjInfo(Task t) : task{t} {}
ObjInfo(Task t, bool khess) : task{t}, const_hess{khess} {}
ObjInfo(Task t) : task{t} {} // NOLINT
ObjInfo(Task t, bool khess, bool zhess) : task{t}, const_hess{khess}, zero_hess(zhess) {}

XGBOOST_DEVICE bool UseOneHot() const {
return (task != ObjInfo::kRegression && task != ObjInfo::kBinary);
}
/**
* \brief Use adaptive tree if the objective doesn't have valid hessian value.
*/
XGBOOST_DEVICE bool UpdateTreeLeaf() const { return zero_hess; }
};
} // namespace xgboost
#endif // XGBOOST_TASK_H_
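A short sketch of how the new flag is meant to be consumed: an L1-style objective reports a constant, zero hessian in its task descriptor, and callers check ``ObjInfo::UpdateTreeLeaf()`` to decide whether the adaptive-tree path (and a node-position-aware updater) is required. The enumerator and constructor come from the header above; the objective itself is hypothetical:

```cpp
#include <xgboost/task.h>

#include <iostream>

int main() {
  // Hypothetical task descriptor of an absolute-error objective: the hessian is
  // constant and zero, so leaf values must be refreshed after tree construction.
  xgboost::ObjInfo task{xgboost::ObjInfo::kRegression, /*khess=*/true, /*zhess=*/true};
  std::cout << std::boolalpha << task.UpdateTreeLeaf() << std::endl;  // prints true
  return 0;
}
```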
15 changes: 11 additions & 4 deletions include/xgboost/tree_updater.h
@@ -50,18 +50,25 @@ class TreeUpdater : public Configurable {
* existing trees.
*/
virtual bool CanModifyTree() const { return false; }
/*!
* \brief Whether the out_position in `Update` is valid. This determines whether the
* adaptive tree can be used.
*/
virtual bool HasNodePosition() const { return false; }
/*!
* \brief perform update to the tree models
* \param gpair the gradient pair statistics of the data
* \param data The data matrix passed to the updater.
* \param trees references the trees to be updated, updater will change the content of trees
* \param out_position The leaf index for each row. The index is negated if that row is
* removed during sampling. So the 3rd node is ~3.
* \param out_trees references the trees to be updated, updater will change the content of trees
* note: all the trees in the vector are updated, with the same statistics,
* but maybe different random seeds, usually one tree is passed in at a time,
* there can be multiple trees when we train random forest style model
*/
virtual void Update(HostDeviceVector<GradientPair>* gpair,
DMatrix* data,
const std::vector<RegTree*>& trees) = 0;
virtual void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* data,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& out_trees) = 0;

/*!
* \brief determines whether updater has enough knowledge about a given dataset
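The negation convention for ``out_position`` documented above can be captured in a couple of lines; a standalone sketch, with the container simplified from HostDeviceVector<bst_node_t> to a plain integer:

```cpp
#include <cstdint>

// Decode one out_position entry: a negative value means the row was removed
// during sampling, and applying ~ recovers the leaf it would have landed in.
// Example: DecodeLeaf(~3, &dropped) == 3 with dropped == true.
std::int32_t DecodeLeaf(std::int32_t encoded, bool* sampled_out) {
  *sampled_out = encoded < 0;
  return *sampled_out ? ~encoded : encoded;
}
```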
@@ -69,7 +69,7 @@ public void testBooster() throws XGBoostError {
.hasHeader().build();

int maxBin = 16;
int round = 100;
int round = 10;
//set params
Map<String, Object> paramMap = new HashMap<String, Object>() {
{
@@ -56,18 +56,20 @@ class GpuPreXGBoost extends PreXGBoostProvider {
}

/**
* Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost
* Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost
*
* @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]]
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ])
* RDD[Watches] will be used as the training input
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean indicating whether the DMatrix is built inside the Rabit context
* RDD[() => Watches] will be used as the training input
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = {
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params)
}

@@ -116,19 +118,21 @@ object GpuPreXGBoost extends PreXGBoostProvider {
}

/**
* Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost
* Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost
*
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ])
* RDD[Watches] will be used as the training input
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean indicating whether the DMatrix is built inside the Rabit context
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = {
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {

val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) =
estimator match {
@@ -166,7 +170,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
xgbExecParams: XGBoostExecutionParams =>
val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers,
xgbExecParams.cacheTrainingSet)
(buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
(true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
}

/**
@@ -403,14 +407,9 @@
}

private def repartitionInputData(dataFrame: DataFrame, nWorkers: Int): DataFrame = {
// We can't check dataFrame.rdd.getNumPartitions == nWorkers here, since dataFrame.rdd is
// a lazy variable. If we call it here, we will not directly extract RDD[Table] again,
// instead, we will involve Columnar -> Row -> Columnar and decrease the performance
if (nWorkers == 1) {
dataFrame.coalesce(1)
} else {
dataFrame.repartition(nWorkers)
}
// we can't involve any coalesce operation here, since Barrier mode checks the
// RDD pattern, which does not allow coalesce.
dataFrame.repartition(nWorkers)
}

private def repartitionForGroup(
Expand Down Expand Up @@ -448,7 +447,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
private def buildRDDWatches(
dataMap: Map[String, ColumnDataBatch],
xgbExeParams: XGBoostExecutionParams,
noEvalSet: Boolean): RDD[Watches] = {
noEvalSet: Boolean): RDD[() => Watches] = {

val sc = dataMap(TRAIN_NAME).rawDF.sparkSession.sparkContext
val maxBin = xgbExeParams.toMap.getOrElse("max_bin", 256).asInstanceOf[Int]
@@ -459,7 +458,7 @@
GpuUtils.toColumnarRdd(dataMap(TRAIN_NAME).rawDF).mapPartitions({
iter =>
val iterColBatch = iter.map(table => new GpuColumnBatch(table, null))
Iterator(buildWatches(
Iterator(() => buildWatches(
PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing,
colIndicesForTrain, iterColBatch, maxBin))
})
@@ -469,7 +468,7 @@
val nameAndColIndices = dataMap.map(nc => (nc._1, nc._2.colIndices))
coPartitionForGpu(dataMap, sc, xgbExeParams.numWorkers).mapPartitions {
nameAndColumnBatchIter =>
Iterator(buildWatchesWithEval(
Iterator(() => buildWatchesWithEval(
PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing,
nameAndColIndices, nameAndColumnBatchIter, maxBin))
}
