
Commit 67a664d
Merge branch 'master' of github.com:dmlc/xgboost into remove-single
RAMitchell committed Apr 29, 2022
2 parents 0486996 + f7db16a commit 67a664d
Showing 106 changed files with 2,137 additions and 1,086 deletions.
1 change: 1 addition & 0 deletions amalgamation/xgboost-all0.cc
@@ -24,6 +24,7 @@
#include "../src/objective/rank_obj.cc"
#include "../src/objective/hinge.cc"
#include "../src/objective/aft_obj.cc"
#include "../src/objective/adaptive.cc"

// gbms
#include "../src/gbm/gbm.cc"
1 change: 0 additions & 1 deletion demo/guide-python/cat_in_the_dat.py
@@ -63,7 +63,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:

params = {
"tree_method": "gpu_hist",
"use_label_encoder": False,
"n_estimators": 32,
"colsample_bylevel": 0.7,
}
14 changes: 6 additions & 8 deletions demo/guide-python/continuation.py
@@ -14,13 +14,13 @@ def training_continuation(tmpdir: str, use_pickle: bool) -> None:
"""Basic training continuation."""
# Train 128 iterations in 1 session
X, y = load_breast_cancer(return_X_y=True)
clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=128)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())

# Train 128 iterations in 2 sessions, with the first session running for 32 iterations
# and the second running for 96 iterations
clf = xgboost.XGBClassifier(n_estimators=32, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=32)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss")
assert clf.get_booster().num_boosted_rounds() == 32

@@ -54,14 +54,14 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
n_estimators = 512

X, y = load_breast_cancer(return_X_y=True)
clf = xgboost.XGBClassifier(n_estimators=n_estimators, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=n_estimators)
clf.fit(X, y, eval_set=[(X, y)], eval_metric="logloss", callbacks=[early_stop])
print("Total boosted rounds:", clf.get_booster().num_boosted_rounds())
best = clf.best_iteration

# Train 512 iterations in 2 sessions, with the first session running for 128 iterations
# and the second running until early stopping.
clf = xgboost.XGBClassifier(n_estimators=128, use_label_encoder=False)
clf = xgboost.XGBClassifier(n_estimators=128)
# Reinitialize the early stop callback
early_stop = xgboost.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
@@ -79,15 +79,13 @@ def training_continuation_early_stop(tmpdir: str, use_pickle: bool) -> None:
else:
path = os.path.join(tmpdir, "model-first-128.json")
clf.save_model(path)
loaded = xgboost.XGBClassifier(use_label_encoder=False)
loaded = xgboost.XGBClassifier()
loaded.load_model(path)

early_stop = xgboost.callback.EarlyStopping(
rounds=early_stopping_rounds, save_best=True
)
clf = xgboost.XGBClassifier(
n_estimators=n_estimators - 128, use_label_encoder=False
)
clf = xgboost.XGBClassifier(n_estimators=n_estimators - 128)
clf.fit(
X,
y,
2 changes: 1 addition & 1 deletion demo/guide-python/predict_first_ntree.py
@@ -35,7 +35,7 @@ def native_interface():
def sklearn_interface():
X_train, y_train = load_svmlight_file(train)
X_test, y_test = load_svmlight_file(test)
clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1, use_label_encoder=False)
clf = xgb.XGBClassifier(n_estimators=3, max_depth=2, eta=1)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)])
assert clf.n_classes_ == 2

4 changes: 2 additions & 2 deletions doc/jvm/xgboost4j_spark_gpu_tutorial.rst
@@ -1,5 +1,5 @@
#############################################
XGBoost4J-Spark-GPU Tutorial (version 1.6.0+)
XGBoost4J-Spark-GPU Tutorial (version 1.6.1+)
#############################################

**XGBoost4J-Spark-GPU** is an open source library aiming to accelerate distributed XGBoost training on Apache Spark cluster from
@@ -220,7 +220,7 @@ application jar is iris-1.0.0.jar
cudf_version=22.02.0
rapids_version=22.02.0
xgboost_version=1.6.0
xgboost_version=1.6.1
main_class=Iris
app_jar=iris-1.0.0.jar
8 changes: 1 addition & 7 deletions doc/jvm/xgboost4j_spark_tutorial.rst
@@ -16,12 +16,6 @@ This tutorial is to cover the end-to-end process to build a machine learning pip
* Building a Machine Learning Pipeline with XGBoost4J-Spark
* Running XGBoost4J-Spark in Production

.. note::

**SparkContext will be stopped by default when XGBoost training task fails**.

XGBoost4J-Spark 1.2.0+ exposes a parameter **kill_spark_context_on_worker_failure**. Set **kill_spark_context_on_worker_failure** to **false** so that the SparkContext will not be stopping on training failure. Instead of stopping the SparkContext, XGBoost4J-Spark will throw an exception instead. Users who want to re-use the SparkContext should wrap the training code in a try-catch block.

.. contents::
:backlinks: none
:local:
@@ -129,7 +123,7 @@ labels. A DataFrame like this (containing vector-represented features and numeri

.. note::

There is no need to assemble feature columns from version 1.6.0+. Instead, users can specify an array of
There is no need to assemble feature columns from version 1.6.1+. Instead, users can specify an array of
feature column names by ``setFeaturesCol(value: Array[String])`` and XGBoost4j-Spark will do it.

Dealing with missing values
9 changes: 8 additions & 1 deletion doc/model.schema
@@ -400,7 +400,6 @@
"reg_loss_param"
]
},

{
"type": "object",
"properties": {
@@ -433,6 +432,14 @@
"tweedie_regression_param"
]
},
{
"properties": {
"name": {
"const": "reg:absoluteerror"
}
},
"type": "object"
},
{
"type": "object",
"properties": {
1 change: 1 addition & 0 deletions doc/parameter.rst
@@ -349,6 +349,7 @@ Specify the learning task and the corresponding learning objective. The objectiv
- ``reg:squaredlogerror``: regression with squared log loss :math:`\frac{1}{2}[log(pred + 1) - log(label + 1)]^2`. All input labels are required to be greater than -1. Also, see metric ``rmsle`` for possible issue with this objective.
- ``reg:logistic``: logistic regression.
- ``reg:pseudohubererror``: regression with Pseudo Huber loss, a twice differentiable alternative to absolute loss.
- ``reg:absoluteerror``: regression with L1 error. When a tree model is used, the leaf values are refreshed after tree construction (see the usage sketch below).
- ``binary:logistic``: logistic regression for binary classification, output probability
- ``binary:logitraw``: logistic regression for binary classification, output score before logistic transformation
- ``binary:hinge``: hinge loss for binary classification. This makes predictions of 0 or 1, rather than producing probabilities.
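As a usage note for the ``reg:absoluteerror`` entry above: the new objective is selected like any other objective string. Below is a minimal sketch against XGBoost's public C API, with a toy matrix, made-up labels, and an arbitrary round count chosen purely for illustration (error handling omitted):

```cpp
#include <xgboost/c_api.h>

#include <limits>
#include <vector>

int main() {
  // Toy data: 4 rows x 2 features, plus labels (illustrative values only).
  std::vector<float> data{1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
  std::vector<float> labels{1.5f, 3.5f, 5.5f, 7.5f};

  DMatrixHandle dtrain;
  XGDMatrixCreateFromMat(data.data(), 4, 2,
                         /*missing=*/std::numeric_limits<float>::quiet_NaN(), &dtrain);
  XGDMatrixSetFloatInfo(dtrain, "label", labels.data(), 4);

  BoosterHandle booster;
  XGBoosterCreate(&dtrain, 1, &booster);
  // Select the new L1 objective; with a tree method, leaf values are refreshed
  // after each tree is built.
  XGBoosterSetParam(booster, "objective", "reg:absoluteerror");
  XGBoosterSetParam(booster, "tree_method", "hist");

  for (int iter = 0; iter < 8; ++iter) {
    XGBoosterUpdateOneIter(booster, iter, dtrain);
  }

  XGBoosterFree(booster);
  XGDMatrixFree(dtrain);
  return 0;
}
```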
2 changes: 1 addition & 1 deletion doc/treemethod.rst
@@ -10,7 +10,7 @@ are also some free standing updaters including ``grow_local_histmaker``, ``refre
as the latter is just a pre-configuration of the former. The difference is mostly due to
historical reasons that each updater requires some specific configurations and might have
missing features. As we are moving forward, the gap between them is becoming more and
more irrevelant. We will collectively document them under tree methods.
more irrelevant. We will collectively document them under tree methods.

**************
Exact Solution
4 changes: 1 addition & 3 deletions doc/tutorials/categorical.rst
@@ -36,9 +36,7 @@ parameter ``enable_categorical``:
.. code:: python
# Supported tree methods are `gpu_hist`, `approx`, and `hist`.
clf = xgb.XGBClassifier(
tree_method="gpu_hist", enable_categorical=True, use_label_encoder=False
)
clf = xgb.XGBClassifier(tree_method="gpu_hist", enable_categorical=True)
# X is the dataframe we created in previous snippet
clf.fit(X, y)
# Must use JSON/UBJSON for serialization, otherwise the information is lost.
5 changes: 2 additions & 3 deletions include/xgboost/gbm.h
@@ -90,9 +90,8 @@ class GradientBooster : public Model, public Configurable {
* \param prediction The output prediction cache entry that needs to be updated.
* the booster may change content of gpair
*/
virtual void DoBoost(DMatrix* p_fmat,
HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry*) = 0;
virtual void DoBoost(DMatrix* p_fmat, HostDeviceVector<GradientPair>* in_gpair,
PredictionCacheEntry*, ObjFunction const* obj) = 0;

/*!
* \brief generate predictions for given feature matrix
8 changes: 6 additions & 2 deletions include/xgboost/linalg.h
@@ -670,9 +670,13 @@ class Tensor {
* See \ref TensorView for parameters of this constructor.
*/
template <typename I, int32_t D>
explicit Tensor(I const (&shape)[D], int32_t device) {
explicit Tensor(I const (&shape)[D], int32_t device)
: Tensor{common::Span<I const, D>{shape}, device} {}

template <typename I, size_t D>
explicit Tensor(common::Span<I const, D> shape, int32_t device) {
// No device unroll as this is a host only function.
std::copy(shape, shape + D, shape_);
std::copy(shape.data(), shape.data() + D, shape_);
for (auto i = D; i < kDim; ++i) {
shape_[i] = 1;
}
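For reference, the array-shape constructor above now simply forwards the shape as a Span to the new Span-based overload. A small sketch of constructing a host tensor this way (assuming ``-1`` denotes the CPU device, as elsewhere in the codebase):

```cpp
#include <xgboost/linalg.h>

#include <cstddef>

int main() {
  // A 3x4 float tensor kept on the host; the array-ref constructor delegates to
  // the new Span-based constructor internally.
  std::size_t shape[2]{3, 4};
  xgboost::linalg::Tensor<float, 2> dense{shape, /*device=*/-1};
  return dense.Size() == 12 ? 0 : 1;
}
```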
20 changes: 19 additions & 1 deletion include/xgboost/objective.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2014-2019 by Contributors
* Copyright 2014-2022 by Contributors
* \file objective.h
* \brief interface of objective function used by xgboost.
* \author Tianqi Chen, Kailong Chen
@@ -22,6 +22,8 @@

namespace xgboost {

class RegTree;

/*! \brief interface of objective function */
class ObjFunction : public Configurable {
protected:
@@ -88,6 +90,22 @@ class ObjFunction : public Configurable {
return 1;
}

/**
* \brief Update the leaf values after a tree is built. Needed for objectives with 0
* hessian.
*
* Note that the leaf update is not well defined for distributed training as XGBoost
* computes only an average of the quantiles across workers. This breaks when some leaves
* have no samples assigned in a local worker.
*
* \param position The leaf index for each row.
* \param info MetaInfo providing labels and weights.
* \param prediction Model prediction after transformation.
* \param p_tree Tree that needs to be updated.
*/
virtual void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
HostDeviceVector<float> const& prediction, RegTree* p_tree) const {}

/*!
* \brief Create an objective function according to name.
* \param tparam Generic parameters.
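To make the intent of the new hook concrete: conceptually, the leaf refresh groups rows by the leaf they landed in (skipping rows whose position was negated by sampling) and recomputes each leaf value from the residuals, for example as their median. The following self-contained sketch illustrates that idea only; the actual logic lives in src/objective/adaptive.cc and differs in detail:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

// Conceptual leaf refresh for a zero-hessian objective such as L1 error.
// Returns the new value for each leaf index; an UpdateTreeLeaf override would
// write these values back into the RegTree.
std::unordered_map<std::int32_t, float> RefreshLeaves(
    std::vector<std::int32_t> const& position,  // leaf index per row, negated if sampled out
    std::vector<float> const& label,
    std::vector<float> const& predt) {          // prediction after transformation
  std::unordered_map<std::int32_t, std::vector<float>> residuals;
  for (std::size_t i = 0; i < position.size(); ++i) {
    if (position[i] < 0) {
      continue;  // row was removed during sampling
    }
    residuals[position[i]].push_back(label[i] - predt[i]);
  }

  std::unordered_map<std::int32_t, float> leaf_value;
  for (auto& kv : residuals) {
    auto& r = kv.second;
    // Median of the residuals assigned to this leaf.
    std::nth_element(r.begin(), r.begin() + r.size() / 2, r.end());
    leaf_value[kv.first] = r[r.size() / 2];
  }
  return leaf_value;
}
```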
9 changes: 7 additions & 2 deletions include/xgboost/task.h
@@ -33,13 +33,18 @@ struct ObjInfo {
} task;
// Does the objective have constant hessian value?
bool const_hess{false};
bool zero_hess{false};

explicit ObjInfo(Task t) : task{t} {}
ObjInfo(Task t, bool khess) : task{t}, const_hess{khess} {}
ObjInfo(Task t) : task{t} {} // NOLINT
ObjInfo(Task t, bool khess, bool zhess) : task{t}, const_hess{khess}, zero_hess(zhess) {}

XGBOOST_DEVICE bool UseOneHot() const {
return (task != ObjInfo::kRegression && task != ObjInfo::kBinary);
}
/**
* \brief Use adaptive tree if the objective doesn't have valid hessian value.
*/
XGBOOST_DEVICE bool UpdateTreeLeaf() const { return zero_hess; }
};
} // namespace xgboost
#endif // XGBOOST_TASK_H_
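A short sketch of how the new flag is meant to be consumed: an L1-style objective reports a constant, zero hessian in its task descriptor, and callers check ``ObjInfo::UpdateTreeLeaf()`` to decide whether the adaptive-tree path (and a node-position-aware updater) is required. The enumerator and constructor come from the header above; the objective itself is hypothetical:

```cpp
#include <xgboost/task.h>

#include <iostream>

int main() {
  // Hypothetical task descriptor of an absolute-error objective: the hessian is
  // constant and zero, so leaf values must be refreshed after tree construction.
  xgboost::ObjInfo task{xgboost::ObjInfo::kRegression, /*khess=*/true, /*zhess=*/true};
  std::cout << std::boolalpha << task.UpdateTreeLeaf() << std::endl;  // prints true
  return 0;
}
```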
15 changes: 11 additions & 4 deletions include/xgboost/tree_updater.h
@@ -50,18 +50,25 @@ class TreeUpdater : public Configurable {
* existing trees.
*/
virtual bool CanModifyTree() const { return false; }
/*!
* \brief Whether the out_position in `Update` is valid. This determines whether the
* adaptive tree can be used.
*/
virtual bool HasNodePosition() const { return false; }
/*!
* \brief perform update to the tree models
* \param gpair the gradient pair statistics of the data
* \param data The data matrix passed to the updater.
* \param trees references the trees to be updated, updater will change the content of trees
* \param out_position The leaf index for each row. The index is negated if that row is
* removed during sampling. So the 3rd node is ~3.
* \param out_trees references the trees to be updated, updater will change the content of trees
* note: all the trees in the vector are updated, with the same statistics,
* but maybe different random seeds, usually one tree is passed in at a time,
* there can be multiple trees when we train random forest style model
*/
virtual void Update(HostDeviceVector<GradientPair>* gpair,
DMatrix* data,
const std::vector<RegTree*>& trees) = 0;
virtual void Update(HostDeviceVector<GradientPair>* gpair, DMatrix* data,
common::Span<HostDeviceVector<bst_node_t>> out_position,
const std::vector<RegTree*>& out_trees) = 0;

/*!
* \brief determines whether updater has enough knowledge about a given dataset
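The negation convention for ``out_position`` documented above can be captured in a couple of lines; a standalone sketch, with the container simplified from HostDeviceVector<bst_node_t> to a plain integer:

```cpp
#include <cstdint>

// Decode one out_position entry: a negative value means the row was removed
// during sampling, and applying ~ recovers the leaf it would have landed in.
// Example: DecodeLeaf(~3, &dropped) == 3 with dropped == true.
std::int32_t DecodeLeaf(std::int32_t encoded, bool* sampled_out) {
  *sampled_out = encoded < 0;
  return *sampled_out ? ~encoded : encoded;
}
```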
@@ -69,7 +69,7 @@ public void testBooster() throws XGBoostError {
.hasHeader().build();

int maxBin = 16;
int round = 100;
int round = 10;
//set params
Map<String, Object> paramMap = new HashMap<String, Object>() {
{
@@ -56,18 +56,20 @@ class GpuPreXGBoost extends PreXGBoostProvider {
}

/**
* Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost
* Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost
*
* @param estimator [[XGBoostClassifier]] or [[XGBoostRegressor]]
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ])
* RDD[Watches] will be used as the training input
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean indicating whether the DMatrix is built inside the Rabit context
* RDD[() => Watches] will be used as the training input
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = {
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {
GpuPreXGBoost.buildDatasetToRDD(estimator, dataset, params)
}

@@ -116,19 +118,21 @@ object GpuPreXGBoost extends PreXGBoostProvider {
}

/**
* Convert the Dataset[_] to RDD[Watches] which will be fed to XGBoost
* Convert the Dataset[_] to RDD[() => Watches] which will be fed to XGBoost
*
* @param estimator supports XGBoostClassifier and XGBoostRegressor
* @param dataset the training data
* @param params all user defined and defaulted params
* @return [[XGBoostExecutionParams]] => (RDD[[Watches]], Option[ RDD[_] ])
* RDD[Watches] will be used as the training input
* @return [[XGBoostExecutionParams]] => (Boolean, RDD[[() => Watches]], Option[ RDD[_] ])
* Boolean indicating whether the DMatrix is built inside the Rabit context
* RDD[() => Watches] will be used as the training input to build DMatrix
* Option[ RDD[_] ] is the optional cached RDD
*/
override def buildDatasetToRDD(
estimator: Estimator[_],
dataset: Dataset[_],
params: Map[String, Any]): XGBoostExecutionParams => (RDD[Watches], Option[RDD[_]]) = {
params: Map[String, Any]):
XGBoostExecutionParams => (Boolean, RDD[() => Watches], Option[RDD[_]]) = {

val (Seq(labelName, weightName, marginName), feturesCols, groupName, evalSets) =
estimator match {
@@ -166,7 +170,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
xgbExecParams: XGBoostExecutionParams =>
val dataMap = prepareInputData(trainingData, evalDataMap, xgbExecParams.numWorkers,
xgbExecParams.cacheTrainingSet)
(buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
(true, buildRDDWatches(dataMap, xgbExecParams, evalDataMap.isEmpty), None)
}

/**
@@ -403,14 +407,9 @@
}

private def repartitionInputData(dataFrame: DataFrame, nWorkers: Int): DataFrame = {
// We can't check dataFrame.rdd.getNumPartitions == nWorkers here, since dataFrame.rdd is
// a lazy variable. If we call it here, we will not directly extract RDD[Table] again,
// instead, we will involve Columnar -> Row -> Columnar and decrease the performance
if (nWorkers == 1) {
dataFrame.coalesce(1)
} else {
dataFrame.repartition(nWorkers)
}
// we can't involve any coalesce operation here, since Barrier mode checks the
// RDD pattern, which does not allow coalesce.
dataFrame.repartition(nWorkers)
}

private def repartitionForGroup(
Expand Down Expand Up @@ -448,7 +447,7 @@ object GpuPreXGBoost extends PreXGBoostProvider {
private def buildRDDWatches(
dataMap: Map[String, ColumnDataBatch],
xgbExeParams: XGBoostExecutionParams,
noEvalSet: Boolean): RDD[Watches] = {
noEvalSet: Boolean): RDD[() => Watches] = {

val sc = dataMap(TRAIN_NAME).rawDF.sparkSession.sparkContext
val maxBin = xgbExeParams.toMap.getOrElse("max_bin", 256).asInstanceOf[Int]
@@ -459,7 +458,7 @@
GpuUtils.toColumnarRdd(dataMap(TRAIN_NAME).rawDF).mapPartitions({
iter =>
val iterColBatch = iter.map(table => new GpuColumnBatch(table, null))
Iterator(buildWatches(
Iterator(() => buildWatches(
PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing,
colIndicesForTrain, iterColBatch, maxBin))
})
@@ -469,7 +468,7 @@
val nameAndColIndices = dataMap.map(nc => (nc._1, nc._2.colIndices))
coPartitionForGpu(dataMap, sc, xgbExeParams.numWorkers).mapPartitions {
nameAndColumnBatchIter =>
Iterator(buildWatchesWithEval(
Iterator(() => buildWatchesWithEval(
PreXGBoost.getCacheDirName(xgbExeParams.useExternalMemory), xgbExeParams.missing,
nameAndColIndices, nameAndColumnBatchIter, maxBin))
}
