From 9d132d86eb5bc2a3611dadb999fafbd47f6a4c8b Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 18 Aug 2020 19:55:41 +0800
Subject: [PATCH] Feature weights (#5962)

---
 amalgamation/xgboost-all0.cc          |  1 +
 demo/guide-python/feature_weights.py  | 49 +++++++++++++
 demo/json-model/json_parser.py        |  2 +-
 doc/parameter.rst                     |  9 ++-
 include/xgboost/c_api.h               | 28 ++++++++
 include/xgboost/data.h                | 29 ++------
 python-package/xgboost/core.py        |  7 +-
 python-package/xgboost/data.py        | 45 ++++++++----
 python-package/xgboost/sklearn.py     | 27 ++++++-
 src/c_api/c_api.cc                    | 11 +++
 src/common/common.h                   | 18 +++++
 src/common/random.cc                  | 38 +++++++++++
 src/common/random.h                   | 65 +++++++++++-------
 src/data/data.cc                      | 25 +++++++
 src/data/data.cu                      | 24 +++++++
 src/tree/updater_colmaker.cc          |  6 +-
 src/tree/updater_gpu_hist.cu          |  6 +-
 src/tree/updater_quantile_hist.cc     | 10 +--
 tests/cpp/common/test_common.cc       | 13 ++++
 tests/cpp/common/test_random.cc       | 68 ++++++++++++++++---
 tests/cpp/tree/test_gpu_hist.cu       |  9 ++-
 .../test_device_quantile_dmatrix.py   | 14 ++++
 tests/python/test_demos.py            | 16 +++--
 tests/python/test_dmatrix.py          | 41 +++++++++--
 tests/python/test_with_sklearn.py     | 61 +++++++++++++++++
 25 files changed, 516 insertions(+), 106 deletions(-)
 create mode 100644 demo/guide-python/feature_weights.py
 create mode 100644 src/common/random.cc
 create mode 100644 tests/cpp/common/test_common.cc

diff --git a/amalgamation/xgboost-all0.cc b/amalgamation/xgboost-all0.cc
index 4dc86aa57802..9980a918d602 100644
--- a/amalgamation/xgboost-all0.cc
+++ b/amalgamation/xgboost-all0.cc
@@ -69,6 +69,7 @@
 #include "../src/learner.cc"
 #include "../src/logging.cc"
 #include "../src/common/common.cc"
+#include "../src/common/random.cc"
 #include "../src/common/charconv.cc"
 #include "../src/common/timer.cc"
 #include "../src/common/host_device_vector.cc"

diff --git a/demo/guide-python/feature_weights.py b/demo/guide-python/feature_weights.py
new file mode 100644
index 000000000000..b9cee8c050af
--- /dev/null
+++ b/demo/guide-python/feature_weights.py
@@ -0,0 +1,49 @@
+'''Using feature weights to change column sampling.
+
+    .. versionadded:: 1.3.0
+'''
+
+import numpy as np
+import xgboost
+from matplotlib import pyplot as plt
+import argparse
+
+
+def main(args):
+    rng = np.random.RandomState(1994)
+
+    kRows = 1000
+    kCols = 10
+
+    X = rng.randn(kRows, kCols)
+    y = rng.randn(kRows)
+    fw = np.ones(shape=(kCols,))
+    for i in range(kCols):
+        fw[i] *= float(i)
+
+    dtrain = xgboost.DMatrix(X, y)
+    dtrain.set_info(feature_weights=fw)
+
+    bst = xgboost.train({'tree_method': 'hist',
+                         'colsample_bynode': 0.5},
+                        dtrain, num_boost_round=10,
+                        evals=[(dtrain, 'd')])
+    feature_map = bst.get_fscore()
+    # feature zero has 0 weight
+    assert feature_map.get('f0', None) is None
+    assert max(feature_map.values()) == feature_map.get('f9')
+
+    if args.plot:
+        xgboost.plot_importance(bst)
+        plt.show()
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--plot',
+        type=int,
+        default=1,
+        help='Set to 0 to disable plotting the feature importance.')
+    args = parser.parse_args()
+    main(args)

diff --git a/demo/json-model/json_parser.py b/demo/json-model/json_parser.py
index eedcbf9c2287..c41a44d881c8 100644
--- a/demo/json-model/json_parser.py
+++ b/demo/json-model/json_parser.py
@@ -94,7 +94,7 @@ def __str__(self):
 class Model:
     '''Gradient boosted tree model.'''
-    def __init__(self, m: dict):
+    def __init__(self, model: dict):
         '''Construct the Model from JSON object.
         parameters

diff --git a/doc/parameter.rst b/doc/parameter.rst
index 27da95a0ced2..f5175757fccd 100644
--- a/doc/parameter.rst
+++ b/doc/parameter.rst
@@ -107,6 +107,10 @@ Parameters for Tree Booster
   'colsample_bynode':0.5}`` with 64 features will leave 8 features to choose from at
   each split.

+  On the Python interface, one can set the ``feature_weights`` for DMatrix to define
+  the probability of each feature being selected when using column sampling.  There is
+  a similar parameter for the ``fit`` method in the sklearn interface.
+
 * ``lambda`` [default=1, alias: ``reg_lambda``]

   - L2 regularization term on weights. Increasing this value will make model more
     conservative.
@@ -225,9 +229,8 @@ Parameters for Tree Booster
   list is a group of indices of features that are allowed to interact with each other.
   See tutorial for more information

-Additional parameters for `hist` and 'gpu_hist' tree method
-================================================
-
+Additional parameters for ``hist`` and ``gpu_hist`` tree method
+================================================================
 * ``single_precision_histogram``, [default=``false``]

   - Use single precision to build histograms instead of double precision.

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 794cbdf19e8f..4db461d11b1c 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -483,6 +483,34 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle,
                                        const char *field,
                                        bst_ulong *size,
                                        const char ***out_features);

+/*!
+ * \brief Set meta info from dense matrix.  Valid field names are:
+ *
+ *  - label
+ *  - weight
+ *  - base_margin
+ *  - group
+ *  - label_lower_bound
+ *  - label_upper_bound
+ *  - feature_weights
+ *
+ * \param handle An instance of data matrix
+ * \param field  Field name
+ * \param data   Pointer to consecutive memory storing data.
+ * \param size   Size of the data in number of elements (NOT number of bytes).
+ * \param type   Indicator of data type.  This is defined in xgboost::DataType enum class.
+ *
+ *    float    = 1
+ *    double   = 2
+ *    uint32_t = 3
+ *    uint64_t = 4
+ *
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, bst_ulong size, int type);
+
 /*!
  * \brief (deprecated) Use XGDMatrixSetUIntInfo instead. Set group of the training matrix
  * \param handle an instance of data matrix

diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index e7350fffeeba..71aaf5f7e171 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -89,34 +89,17 @@ class MetaInfo {
   /*!
    * \brief Type of each feature.  Automatically set when feature_type_names is specified.
    */
   HostDeviceVector<FeatureType> feature_types;
+  /*!
+   * \brief Weight of each feature, used to define the probability of each feature being
+   *   selected when using column sampling.
+   */
+  HostDeviceVector<float> feature_weights;
  /*! \brief default constructor */
  MetaInfo()  = default;
  MetaInfo(MetaInfo&& that) = default;
  MetaInfo& operator=(MetaInfo&& that) = default;
-  MetaInfo& operator=(MetaInfo const& that) {
-    this->num_row_ = that.num_row_;
-    this->num_col_ = that.num_col_;
-    this->num_nonzero_ = that.num_nonzero_;
-
-    this->labels_.Resize(that.labels_.Size());
-    this->labels_.Copy(that.labels_);
-
-    this->group_ptr_ = that.group_ptr_;
-
-    this->weights_.Resize(that.weights_.Size());
-    this->weights_.Copy(that.weights_);
-
-    this->base_margin_.Resize(that.base_margin_.Size());
-    this->base_margin_.Copy(that.base_margin_);
-
-    this->labels_lower_bound_.Resize(that.labels_lower_bound_.Size());
-    this->labels_lower_bound_.Copy(that.labels_lower_bound_);
-
-    this->labels_upper_bound_.Resize(that.labels_upper_bound_.Size());
-    this->labels_upper_bound_.Copy(that.labels_upper_bound_);
-    return *this;
-  }
+  MetaInfo& operator=(MetaInfo const& that) = delete;

  /*!
   * \brief Validate all metainfo.

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index 4bc77783ee91..cf22453245ac 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -455,7 +455,8 @@ def set_info(self,
                  label_lower_bound=None,
                  label_upper_bound=None,
                  feature_names=None,
-                 feature_types=None):
+                 feature_types=None,
+                 feature_weights=None):
         '''Set meta info for DMatrix.'''
         if label is not None:
             self.set_label(label)
@@ -473,6 +474,10 @@ def set_info(self,
             self.feature_names = feature_names
         if feature_types is not None:
             self.feature_types = feature_types
+        if feature_weights is not None:
+            from .data import dispatch_meta_backend
+            dispatch_meta_backend(matrix=self, data=feature_weights,
+                                  name='feature_weights')

     def get_float_info(self, field):
         """Get float property from the DMatrix.

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 0caab3ec17bb..e8751844f467 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -530,22 +530,38 @@ def dispatch_data_backend(data, missing, threads,
     raise TypeError('Not supported type for data.' +
                     str(type(data)))


+def _to_data_type(dtype: str, name: str):
+    dtype_map = {'float32': 1, 'float64': 2, 'uint32': 3, 'uint64': 4}
+    if dtype not in dtype_map.keys():
+        raise TypeError(
+            f'Expecting float32, float64, uint32, uint64, got {dtype} ' +
+            f'for {name}.')
+    return dtype_map[dtype]
+
+
+def _validate_meta_shape(data):
+    if hasattr(data, 'shape'):
+        assert len(data.shape) == 1 or (
+            len(data.shape) == 2 and
+            (data.shape[1] == 0 or data.shape[1] == 1))
+
+
 def _meta_from_numpy(data, field, dtype, handle):
     data = _maybe_np_slice(data, dtype)
-    if dtype == 'uint32':
-        c_data = c_array(ctypes.c_uint32, data)
-        _check_call(_LIB.XGDMatrixSetUIntInfo(handle,
-                                              c_str(field),
-                                              c_array(ctypes.c_uint, data),
-                                              c_bst_ulong(len(data))))
-    elif dtype == 'float':
-        c_data = c_array(ctypes.c_float, data)
-        _check_call(_LIB.XGDMatrixSetFloatInfo(handle,
-                                               c_str(field),
-                                               c_data,
-                                               c_bst_ulong(len(data))))
-    else:
-        raise TypeError('Unsupported type ' + str(dtype) + ' for:' + field)
+    interface = data.__array_interface__
+    assert interface.get('mask', None) is None, 'Masked array is not supported'
+    size = data.shape[0]
+
+    c_type = _to_data_type(str(data.dtype), field)
+    ptr = interface['data'][0]
+    ptr = ctypes.c_void_p(ptr)
+    _check_call(_LIB.XGDMatrixSetDenseInfo(
+        handle,
+        c_str(field),
+        ptr,
+        c_bst_ulong(size),
+        c_type
+    ))


 def _meta_from_list(data, field, dtype, handle):
@@ -595,6 +611,7 @@ def _meta_from_dt(data, field, dtype, handle):
 def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
     '''Dispatch for meta info.'''
     handle = matrix.handle
+    _validate_meta_shape(data)
     if data is None:
         return
     if _is_list(data):

diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
index d1c3c6d53952..4721533b49d5 100644
--- a/python-package/xgboost/sklearn.py
+++ b/python-package/xgboost/sklearn.py
@@ -441,6 +441,7 @@ def load_model(self, fname):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None, early_stopping_rounds=None,
             verbose=True, xgb_model=None, sample_weight_eval_set=None,
+            feature_weights=None,
             callbacks=None):
         # pylint: disable=invalid-name,attribute-defined-outside-init
         """Fit gradient boosting model
@@ -459,9 +460,6 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             A list of (X, y) tuple pairs to use as validation sets, for which
             metrics will be computed.
             Validation metrics will help us track the performance of the model.
-        sample_weight_eval_set : list, optional
-            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
-            instance weights on the i-th validation set.
         eval_metric : str, list of str, or callable, optional
             If a str, should be a built-in evaluation metric to use. See
             doc/parameter.rst.
@@ -490,6 +488,13 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        sample_weight_eval_set : list, optional
+            A list of the form [L_1, L_2, ..., L_n], where each L_i is a list of
+            instance weights on the i-th validation set.
+        feature_weights : array_like
+            Weight for each feature, defining the probability of each feature
+            being selected when column sampling is used.  All values must be
+            greater than or equal to 0; otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each
             iteration.
            It is possible to use predefined callbacks by using :ref:`callback_api`.
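For context, a minimal sketch of how the new ``feature_weights`` argument is meant to
be used through the sklearn interface (synthetic data; all values are illustrative and
not part of the patch):

    import numpy as np
    import xgboost

    rng = np.random.RandomState(1994)
    X = rng.randn(256, 16)
    y = rng.randn(256)
    # Bias column sampling towards higher-indexed features; a weight of zero
    # removes a feature from sampling entirely.
    fw = np.arange(16).astype(np.float32)

    reg = xgboost.XGBRegressor(tree_method='hist', colsample_bynode=0.5)
    reg.fit(X, y, feature_weights=fw)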
@@ -498,6 +503,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
             .. code-block:: python

                 [xgb.callback.reset_learning_rate(custom_rates)]
+
         """
         self.n_features_in_ = X.shape[1]

@@ -505,6 +511,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
                                 base_margin=base_margin,
                                 missing=self.missing,
                                 nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)

         evals_result = {}

@@ -762,7 +769,7 @@ def __init__(self, objective="binary:logistic", **kwargs):
     def fit(self, X, y, sample_weight=None, base_margin=None,
             eval_set=None, eval_metric=None,
             early_stopping_rounds=None, verbose=True, xgb_model=None,
-            sample_weight_eval_set=None, callbacks=None):
+            sample_weight_eval_set=None, feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ

         evals_result = {}
@@ -824,6 +831,7 @@ def fit(self, X, y, sample_weight=None, base_margin=None,
         train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)

         self._Booster = train(xgb_options, train_dmatrix,
                               self.get_num_boosting_rounds(),
@@ -1107,10 +1115,10 @@ def __init__(self, objective='rank:pairwise', **kwargs):
             raise ValueError("please use XGBRanker for ranking task")

     def fit(self, X, y, group, sample_weight=None, base_margin=None,
-            eval_set=None,
-            sample_weight_eval_set=None, eval_group=None, eval_metric=None,
+            eval_set=None, sample_weight_eval_set=None,
+            eval_group=None, eval_metric=None,
             early_stopping_rounds=None, verbose=False, xgb_model=None,
-            callbacks=None):
+            feature_weights=None, callbacks=None):
         # pylint: disable = attribute-defined-outside-init,arguments-differ
         """Fit gradient boosting ranker

@@ -1176,6 +1184,10 @@ def fit(self, X, y, group, sample_weight=None, base_margin=None,
         xgb_model : str
             file name of stored XGBoost model or 'Booster' instance XGBoost
             model to be loaded before training (allows training continuation).
+        feature_weights : array_like
+            Weight for each feature, defining the probability of each feature
+            being selected when column sampling is used.  All values must be
+            greater than or equal to 0; otherwise a `ValueError` is thrown.
         callbacks : list of callback functions
             List of callback functions that are applied at end of each
             iteration.
            It is possible to use predefined callbacks by using
@@ -1211,6 +1223,7 @@ def _dmat_init(group, **params):
         train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
                                 base_margin=base_margin,
                                 missing=self.missing, nthread=self.n_jobs)
+        train_dmatrix.set_info(feature_weights=feature_weights)
         train_dmatrix.set_group(group)

         evals_result = {}

diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index aa6ecf43a784..397f83e69bf8 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -316,6 +316,17 @@ XGB_DLL int XGDMatrixGetStrFeatureInfo(DMatrixHandle handle, const char *field,
   API_END();
 }

+XGB_DLL int XGDMatrixSetDenseInfo(DMatrixHandle handle, const char *field,
+                                  void *data, xgboost::bst_ulong size,
+                                  int type) {
+  API_BEGIN();
+  CHECK_HANDLE();
+  auto &info = static_cast<std::shared_ptr<DMatrix> *>(handle)->get()->Info();
+  CHECK(type >= 1 && type <= 4);
+  info.SetInfo(field, data, static_cast<DataType>(type), size);
+  API_END();
+}
+
 XGB_DLL int XGDMatrixSetGroup(DMatrixHandle handle,
                               const unsigned* group,
                               xgboost::bst_ulong len) {

diff --git a/src/common/common.h b/src/common/common.h
index f9300f3d583c..a4397d1c89aa 100644
--- a/src/common/common.h
+++ b/src/common/common.h
@@ -9,12 +9,15 @@
 #include <xgboost/base.h>
 #include <xgboost/logging.h>
+#include <algorithm>
 #include <exception>
+#include <functional>
 #include <limits>
 #include <type_traits>
 #include <vector>
 #include <string>
 #include <sstream>
+#include <numeric>

 #if defined(__CUDACC__)
 #include <thrust/system/cuda/error.h>
@@ -154,6 +157,21 @@ inline void AssertGPUSupport() {
 #endif  // XGBOOST_USE_CUDA
 }

+inline void AssertOneAPISupport() {
+#ifndef XGBOOST_USE_ONEAPI
+  LOG(FATAL) << "XGBoost version not compiled with OneAPI support.";
+#endif  // XGBOOST_USE_ONEAPI
+}
+
+template <typename Idx = size_t, typename V, typename Comp = std::less<V>>
+std::vector<Idx> ArgSort(std::vector<V> const &array, Comp comp = std::less<V>{}) {
+  std::vector<Idx> result(array.size());
+  std::iota(result.begin(), result.end(), 0);
+  std::stable_sort(
+      result.begin(), result.end(),
+      [&array, comp](Idx const &l, Idx const &r) { return comp(array[l], array[r]); });
+  return result;
+}
 }  // namespace common
 }  // namespace xgboost
 #endif  // XGBOOST_COMMON_COMMON_H_

diff --git a/src/common/random.cc b/src/common/random.cc
new file mode 100644
index 000000000000..f386cad916b2
--- /dev/null
+++ b/src/common/random.cc
@@ -0,0 +1,38 @@
+/*!
+ * Copyright 2020 by XGBoost Contributors
+ * \file random.cc
+ */
+#include "random.h"
+
+namespace xgboost {
+namespace common {
+std::shared_ptr<HostDeviceVector<bst_feature_t>> ColumnSampler::ColSample(
+    std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features,
+    float colsample) {
+  if (colsample == 1.0f) {
+    return p_features;
+  }
+  const auto &features = p_features->HostVector();
+  CHECK_GT(features.size(), 0);
+
+  int n = std::max(1, static_cast<int>(colsample * features.size()));
+  auto p_new_features = std::make_shared<HostDeviceVector<bst_feature_t>>();
+  auto &new_features = *p_new_features;
+
+  if (feature_weights_.size() != 0) {
+    new_features.HostVector() = WeightedSamplingWithoutReplacement(
+        p_features->HostVector(), feature_weights_, n);
+  } else {
+    new_features.Resize(features.size());
+    std::copy(features.begin(), features.end(),
+              new_features.HostVector().begin());
+    std::shuffle(new_features.HostVector().begin(),
+                 new_features.HostVector().end(), rng_);
+    new_features.Resize(n);
+  }
+  std::sort(new_features.HostVector().begin(), new_features.HostVector().end());
+  return p_new_features;
+}
+
+}  // namespace common
+}  // namespace xgboost
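The sampler above delegates to the Efraimidis–Spirakis scheme declared in random.h
below: each feature draws a key log(u) / w (a monotone transform of the paper's
u ** (1 / w)), and the n largest keys form a without-replacement sample whose
inclusion probabilities follow the weights.  A rough NumPy sketch of the same idea
(the function name and the epsilon floor, which mirrors kRtEps, are illustrative):

    import numpy as np

    def weighted_sampling_without_replacement(array, weights, n, rng):
        # Keys are log(u) / w; taking the n largest is equivalent to taking
        # the n largest u ** (1 / w), the Efraimidis-Spirakis reservoir keys.
        u = rng.uniform(size=len(array))
        keys = np.log(u) / np.maximum(weights, 1e-6)
        order = np.argsort(-keys)  # descending, largest keys first
        return array[order[:n]]

    rng = np.random.RandomState(1994)
    features = np.arange(8)
    weights = np.arange(8, dtype=np.float64)  # feature 0 has zero weight
    print(weighted_sampling_without_replacement(features, weights, 4, rng))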
diff --git a/src/common/random.h b/src/common/random.h
index 45af80ce030b..7fd461d22d0f 100644
--- a/src/common/random.h
+++ b/src/common/random.h
@@ -1,5 +1,5 @@
 /*!
- * Copyright 2015 by Contributors
+ * Copyright 2015-2020 by Contributors
  * \file random.h
  * \brief Utility related to random.
  * \author Tianqi Chen
  */
@@ -10,14 +10,17 @@
 #include <xgboost/logging.h>
 #include <algorithm>
 #include <vector>
+#include <cmath>
 #include <limits>
 #include <map>
 #include <memory>
 #include <numeric>
 #include <random>
 #include <functional>
+#include <utility>

 #include "xgboost/host_device_vector.h"
+#include "common.h"

 namespace xgboost {
 namespace common {
@@ -75,6 +78,38 @@ using GlobalRandomEngine = RandomEngine;
  */
 GlobalRandomEngine& GlobalRandom(); // NOLINT(*)

+/*
+ * Original paper:
+ * Weighted Random Sampling (2005; Efraimidis, Spirakis)
+ *
+ * Blog:
+ * https://timvieira.github.io/blog/post/2019/09/16/algorithms-for-sampling-without-replacement/
+ */
+template <typename T>
+std::vector<T> WeightedSamplingWithoutReplacement(
+    std::vector<T> const &array, std::vector<float> const &weights, size_t n) {
+  // ES sampling.
+  CHECK_EQ(array.size(), weights.size());
+  std::vector<float> keys(weights.size());
+  std::uniform_real_distribution<float> dist;
+  auto& rng = GlobalRandom();
+  for (size_t i = 0; i < array.size(); ++i) {
+    auto w = std::max(weights.at(i), kRtEps);
+    auto u = dist(rng);
+    auto k = std::log(u) / w;
+    keys[i] = k;
+  }
+  auto ind = ArgSort(keys, std::greater<>{});
+  ind.resize(n);
+
+  std::vector<T> results(ind.size());
+  for (size_t k = 0; k < ind.size(); ++k) {
+    auto idx = ind[k];
+    results[k] = array[idx];
+  }
+  return results;
+}
+
 /**
  * \class ColumnSampler
  *
 * \brief Handles selection of columns due to colsample_bytree, colsample_bylevel and
 * colsample_bynode parameters.  Should be initialised before tree construction and
 * reset when tree construction is completed.
 */
-
 class ColumnSampler {
   std::shared_ptr<HostDeviceVector<bst_feature_t>> feature_set_tree_;
   std::map<int, std::shared_ptr<HostDeviceVector<bst_feature_t>>> feature_set_level_;
+  std::vector<float> feature_weights_;
   float colsample_bylevel_{1.0f};
   float colsample_bytree_{1.0f};
   float colsample_bynode_{1.0f};
   GlobalRandomEngine rng_;

-  std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
-      std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample) {
-    if (colsample == 1.0f) return p_features;
-    const auto& features = p_features->HostVector();
-    CHECK_GT(features.size(), 0);
-    int n = std::max(1, static_cast<int>(colsample * features.size()));
-    auto p_new_features = std::make_shared<HostDeviceVector<bst_feature_t>>();
-    auto& new_features = *p_new_features;
-    new_features.Resize(features.size());
-    std::copy(features.begin(), features.end(),
-              new_features.HostVector().begin());
-    std::shuffle(new_features.HostVector().begin(),
-                 new_features.HostVector().end(), rng_);
-    new_features.Resize(n);
-    std::sort(new_features.HostVector().begin(),
-              new_features.HostVector().end());
-
-    return p_new_features;
-  }
-
  public:
+  std::shared_ptr<HostDeviceVector<bst_feature_t>> ColSample(
+      std::shared_ptr<HostDeviceVector<bst_feature_t>> p_features, float colsample);
   /**
    * \brief Column sampler constructor.
    * \note This constructor manually sets the rng seed
@@ -139,8 +156,10 @@ class ColumnSampler {
    * \param colsample_bytree
    * \param skip_index_0 (Optional) True to skip index 0.
   */
-  void Init(int64_t num_col, float colsample_bynode, float colsample_bylevel,
+  void Init(int64_t num_col, std::vector<float> feature_weights,
+            float colsample_bynode, float colsample_bylevel,
             float colsample_bytree, bool skip_index_0 = false) {
+    feature_weights_ = std::move(feature_weights);
     colsample_bylevel_ = colsample_bylevel;
     colsample_bytree_ = colsample_bytree;
     colsample_bynode_ = colsample_bynode;

diff --git a/src/data/data.cc b/src/data/data.cc
index c09e213d41f8..7d55c3315c0a 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -293,6 +293,9 @@ MetaInfo MetaInfo::Slice(common::Span<int32_t const> ridxs) const {
   } else {
     out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs);
   }
+
+  out.feature_weights.Resize(this->feature_weights.Size());
+  out.feature_weights.Copy(this->feature_weights);
   return out;
 }

@@ -377,6 +380,16 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t
     labels.resize(num);
     DISPATCH_CONST_PTR(dtype, dptr, cast_dptr,
                        std::copy(cast_dptr, cast_dptr + num, labels.begin()));
+  } else if (!std::strcmp(key, "feature_weights")) {
+    auto &h_feature_weights = feature_weights.HostVector();
+    h_feature_weights.resize(num);
+    DISPATCH_CONST_PTR(
+        dtype, dptr, cast_dptr,
+        std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin()));
+    bool valid =
+        std::all_of(h_feature_weights.cbegin(), h_feature_weights.cend(),
+                    [](float w) { return w >= 0; });
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
   } else {
     LOG(FATAL) << "Unknown key for MetaInfo: " << key;
   }
@@ -396,6 +409,8 @@ void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype,
     vec = &this->labels_lower_bound_.HostVector();
   } else if (!std::strcmp(key, "label_upper_bound")) {
     vec = &this->labels_upper_bound_.HostVector();
+  } else if (!std::strcmp(key, "feature_weights")) {
+    vec = &this->feature_weights.HostVector();
   } else {
     LOG(FATAL) << "Unknown float field name: " << key;
   }
@@ -497,6 +512,11 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows) {
     auto &h_feature_types = feature_types.HostVector();
     LoadFeatureType(this->feature_type_names, &h_feature_types);
   }
+  if (!that.feature_weights.Empty()) {
+    this->feature_weights.Resize(that.feature_weights.Size());
+    this->feature_weights.SetDevice(that.feature_weights.DeviceIdx());
+    this->feature_weights.Copy(that.feature_weights);
+  }
 }

 void MetaInfo::Validate(int32_t device) const {
@@ -538,6 +558,11 @@ void MetaInfo::Validate(int32_t device) const {
     check_device(labels_lower_bound_);
     return;
   }
+  if (feature_weights.Size() != 0) {
+    CHECK_EQ(feature_weights.Size(), num_col_)
+        << "Size of feature_weights must equal the number of columns.";
+    check_device(feature_weights);
+  }
   if (labels_upper_bound_.Size() != 0) {
     CHECK_EQ(labels_upper_bound_.Size(), num_row_)
         << "Size of label_upper_bound must equal to number of rows.";

diff --git a/src/data/data.cu b/src/data/data.cu
index 5e63a828c207..15260498734d 100644
--- a/src/data/data.cu
+++ b/src/data/data.cu
@@ -58,6 +58,15 @@ void CopyGroupInfoImpl(ArrayInterface column, std::vector<bst_group_t>* out) {
   std::partial_sum(out->begin(), out->end(), out->begin());
 }

+namespace {
+// thrust::all_of tries to copy the lambda function.
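+// Extended __device__ lambdas cannot always be copied safely between host
+// and device, so a plain functor with a __device__ operator() is used to
+// check that every weight is non-negative.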
+struct AllOfOp {
+  __device__ bool operator()(float w) {
+    return w >= 0;
+  }
+};
+}  // anonymous namespace
+
 void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
   auto const& j_arr = get<Array>(j_interface);
@@ -82,6 +91,21 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
   } else if (key == "group") {
     CopyGroupInfoImpl(array_interface, &group_ptr_);
     return;
+  } else if (key == "label_lower_bound") {
+    CopyInfoImpl(array_interface, &labels_lower_bound_);
+    return;
+  } else if (key == "label_upper_bound") {
+    CopyInfoImpl(array_interface, &labels_upper_bound_);
+    return;
+  } else if (key == "feature_weights") {
+    CopyInfoImpl(array_interface, &feature_weights);
+    auto d_feature_weights = feature_weights.ConstDeviceSpan();
+    auto valid =
+        thrust::all_of(thrust::device, d_feature_weights.data(),
+                       d_feature_weights.data() + d_feature_weights.size(),
+                       AllOfOp{});
+    CHECK(valid) << "Feature weight must be greater than or equal to 0.";
+    return;
   } else {
     LOG(FATAL) << "Unknown metainfo: " << key;
   }

diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc
index 951cfdb5ec27..45cdb0ba9163 100644
--- a/src/tree/updater_colmaker.cc
+++ b/src/tree/updater_colmaker.cc
@@ -235,8 +235,10 @@ class ColMaker: public TreeUpdater {
       }
     }
     {
-      column_sampler_.Init(fmat.Info().num_col_, param_.colsample_bynode,
-                           param_.colsample_bylevel, param_.colsample_bytree);
+      column_sampler_.Init(fmat.Info().num_col_,
+                           fmat.Info().feature_weights.ConstHostVector(),
+                           param_.colsample_bynode, param_.colsample_bylevel,
+                           param_.colsample_bytree);
     }
     {
       // setup temp space for each thread

diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 5cbe75350402..3535a59d6f85 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -266,8 +266,10 @@ struct GPUHistMakerDevice {
   // Note that the column sampler must be passed by value because it is not
   // thread safe
   void Reset(HostDeviceVector<GradientPair>* dh_gpair, DMatrix* dmat, int64_t num_columns) {
-    this->column_sampler.Init(num_columns, param.colsample_bynode,
-                              param.colsample_bylevel, param.colsample_bytree);
+    auto const& info = dmat->Info();
+    this->column_sampler.Init(num_columns, info.feature_weights.HostVector(),
+                              param.colsample_bynode, param.colsample_bylevel,
+                              param.colsample_bytree);
     dh::safe_cuda(cudaSetDevice(device_id));
     this->interaction_constraints.Reset();
     std::fill(node_sum_gradients.begin(), node_sum_gradients.end(),

diff --git a/src/tree/updater_quantile_hist.cc b/src/tree/updater_quantile_hist.cc
index 37a90dfebd74..95d3c2008ef9 100644
--- a/src/tree/updater_quantile_hist.cc
+++ b/src/tree/updater_quantile_hist.cc
@@ -841,11 +841,13 @@ void QuantileHistMaker::Builder::InitData(const GHistIndexMatrix&
   // store a pointer to the tree
   p_last_tree_ = &tree;
   if (data_layout_ == kDenseDataOneBased) {
-    column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel,
-                         param_.colsample_bytree, true);
+    column_sampler_.Init(info.num_col_, info.feature_weights.ConstHostVector(),
+                         param_.colsample_bynode, param_.colsample_bylevel,
+                         param_.colsample_bytree, true);
   } else {
-    column_sampler_.Init(info.num_col_, param_.colsample_bynode, param_.colsample_bylevel,
-                         param_.colsample_bytree, false);
+    column_sampler_.Init(info.num_col_, info.feature_weights.ConstHostVector(),
+                         param_.colsample_bynode, param_.colsample_bylevel,
+                         param_.colsample_bytree, false);
   }
   if (data_layout_ == kDenseDataZeroBased || data_layout_ == kDenseDataOneBased) {
     /* specialized code for dense data:
diff --git a/tests/cpp/common/test_common.cc b/tests/cpp/common/test_common.cc
new file mode 100644
index 000000000000..006860b11af2
--- /dev/null
+++ b/tests/cpp/common/test_common.cc
@@ -0,0 +1,13 @@
+#include <gtest/gtest.h>
+#include "../../../src/common/common.h"
+
+namespace xgboost {
+namespace common {
+TEST(ArgSort, Basic) {
+  std::vector<float> inputs {3.0f, 2.0f, 1.0f};
+  auto ret = ArgSort(inputs);
+  std::vector<size_t> sol{2, 1, 0};
+  ASSERT_EQ(ret, sol);
+}
+}  // namespace common
+}  // namespace xgboost

diff --git a/tests/cpp/common/test_random.cc b/tests/cpp/common/test_random.cc
index dc7b38554162..9b2a1515543f 100644
--- a/tests/cpp/common/test_random.cc
+++ b/tests/cpp/common/test_random.cc
@@ -8,9 +8,10 @@ namespace common {
 TEST(ColumnSampler, Test) {
   int n = 128;
   ColumnSampler cs;
+  std::vector<float> feature_weights;

   // No node sampling
-  cs.Init(n, 1.0f, 0.5f, 0.5f);
+  cs.Init(n, feature_weights, 1.0f, 0.5f, 0.5f);
   auto set0 = cs.GetFeatureSet(0);
   ASSERT_EQ(set0->Size(), 32);

@@ -23,7 +24,7 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set2->Size(), 32);

   // Node sampling
-  cs.Init(n, 0.5f, 1.0f, 0.5f);
+  cs.Init(n, feature_weights, 0.5f, 1.0f, 0.5f);
   auto set3 = cs.GetFeatureSet(0);
   ASSERT_EQ(set3->Size(), 32);

@@ -33,19 +34,19 @@ TEST(ColumnSampler, Test) {
   ASSERT_EQ(set4->Size(), 32);

   // No level or node sampling, should be the same at different depth
-  cs.Init(n, 1.0f, 1.0f, 0.5f);
+  cs.Init(n, feature_weights, 1.0f, 1.0f, 0.5f);
   ASSERT_EQ(cs.GetFeatureSet(0)->HostVector(),
             cs.GetFeatureSet(1)->HostVector());

-  cs.Init(n, 1.0f, 1.0f, 1.0f);
+  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set5 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->Size(), n);

-  cs.Init(n, 1.0f, 1.0f, 1.0f);
+  cs.Init(n, feature_weights, 1.0f, 1.0f, 1.0f);
   auto set6 = cs.GetFeatureSet(0);
   ASSERT_EQ(set5->HostVector(), set6->HostVector());

   // Should always be a minimum of one feature
-  cs.Init(n, 1e-16f, 1e-16f, 1e-16f);
+  cs.Init(n, feature_weights, 1e-16f, 1e-16f, 1e-16f);
   ASSERT_EQ(cs.GetFeatureSet(0)->Size(), 1);
 }

@@ -56,13 +57,13 @@ TEST(ColumnSampler, ThreadSynchronisation) {
   size_t iterations = 10;
   size_t levels = 5;
   std::vector<bst_feature_t> reference_result;
-  bool success =
-      true;  // Cannot use google test asserts in multithreaded region
+  std::vector<float> feature_weights;
+  bool success = true;  // Cannot use google test asserts in multithreaded region
 #pragma omp parallel num_threads(num_threads)
   {
     for (auto j = 0ull; j < iterations; j++) {
       ColumnSampler cs(j);
-      cs.Init(n, 0.5f, 0.5f, 0.5f);
+      cs.Init(n, feature_weights, 0.5f, 0.5f, 0.5f);
       for (auto level = 0ull; level < levels; level++) {
         auto result = cs.GetFeatureSet(level)->ConstHostVector();
 #pragma omp single
@@ -76,5 +77,54 @@ TEST(ColumnSampler, ThreadSynchronisation) {
   }
   ASSERT_TRUE(success);
 }
+
+TEST(ColumnSampler, WeightedSampling) {
+  auto test_basic = [](int first) {
+    std::vector<float> feature_weights(2);
+    feature_weights[0] = std::abs(first - 1.0f);
+    feature_weights[1] = first - 0.0f;
+    ColumnSampler cs{0};
+    cs.Init(2, feature_weights, 1.0, 1.0, 0.5);
+    auto feature_sets = cs.GetFeatureSet(0);
+    auto const &h_feat_set = feature_sets->HostVector();
+    ASSERT_EQ(h_feat_set.size(), 1);
+    ASSERT_EQ(h_feat_set[0], first - 0);
+  };
+
+  test_basic(0);
+  test_basic(1);
+
+  size_t constexpr kCols = 64;
+  std::vector<float> feature_weights(kCols);
+  SimpleLCG rng;
+  SimpleRealUniformDistribution<float> dist(.0f, 12.0f);
+  std::generate(feature_weights.begin(), feature_weights.end(),
+                [&]() { return dist(&rng); });
+  ColumnSampler cs{0};
+  cs.Init(kCols, feature_weights, 0.5f, 1.0f, 1.0f);
+  std::vector<bst_feature_t> features(kCols);
+  std::iota(features.begin(), features.end(), 0);
+  std::vector<float> freq(kCols, 0);
+  for (size_t i = 0; i < 1024; ++i) {
+    auto fset = cs.GetFeatureSet(0);
+    ASSERT_EQ(kCols * 0.5, fset->Size());
+    auto const& h_fset = fset->HostVector();
+    for (auto f : h_fset) {
+      freq[f] += 1.0f;
+    }
+  }
+
+  auto norm = std::accumulate(freq.cbegin(), freq.cend(), .0f);
+  for (auto& f : freq) {
+    f /= norm;
+  }
+  norm = std::accumulate(feature_weights.cbegin(), feature_weights.cend(), .0f);
+  for (auto& f : feature_weights) {
+    f /= norm;
+  }
+
+  for (size_t i = 0; i < feature_weights.size(); ++i) {
+    EXPECT_NEAR(freq[i], feature_weights[i], 1e-2);
+  }
+}
 }  // namespace common
 }  // namespace xgboost

diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu
index fd5c9f43fb2a..153cafb88fd8 100644
--- a/tests/cpp/tree/test_gpu_hist.cu
+++ b/tests/cpp/tree/test_gpu_hist.cu
@@ -204,12 +204,11 @@ TEST(GpuHist, EvaluateRootSplit) {
   ASSERT_EQ(maker.hist.Data().size(), hist.size());
   thrust::copy(hist.begin(), hist.end(), maker.hist.Data().begin());
+  std::vector<float> feature_weights;

-  maker.column_sampler.Init(kNCols,
-                            param.colsample_bynode,
-                            param.colsample_bylevel,
-                            param.colsample_bytree,
-                            false);
+  maker.column_sampler.Init(kNCols, feature_weights, param.colsample_bynode,
+                            param.colsample_bylevel, param.colsample_bytree,
+                            false);

   RegTree tree;
   MetaInfo info;

diff --git a/tests/python-gpu/test_device_quantile_dmatrix.py b/tests/python-gpu/test_device_quantile_dmatrix.py
index f0978a0afaf4..c44de28bd2ff 100644
--- a/tests/python-gpu/test_device_quantile_dmatrix.py
+++ b/tests/python-gpu/test_device_quantile_dmatrix.py
@@ -16,6 +16,20 @@ def test_dmatrix_numpy_init(self):
                            match='is not supported for DeviceQuantileDMatrix'):
             xgb.DeviceQuantileDMatrix(data, np.ones(5, dtype=np.float64))

+    @pytest.mark.skipif(**tm.no_cupy())
+    def test_dmatrix_feature_weights(self):
+        import cupy as cp
+        rng = cp.random.RandomState(1994)
+        data = rng.randn(5, 5)
+        m = xgb.DMatrix(data)
+
+        feature_weights = rng.uniform(size=5)
+        m.set_info(feature_weights=feature_weights)
+
+        cp.testing.assert_array_equal(
+            cp.array(m.get_float_info('feature_weights')),
+            feature_weights.astype(np.float32))
+
     @pytest.mark.skipif(**tm.no_cupy())
     def test_dmatrix_cupy_init(self):
         import cupy as cp

diff --git a/tests/python/test_demos.py b/tests/python/test_demos.py
index 8b6535dbff45..25c1c4de6c1f 100644
--- a/tests/python/test_demos.py
+++ b/tests/python/test_demos.py
@@ -1,12 +1,10 @@
 import os
 import subprocess
-import sys

 import pytest
 import testing as tm

-CURRENT_DIR = os.path.dirname(__file__)
-ROOT_DIR = os.path.dirname(os.path.dirname(CURRENT_DIR))
+ROOT_DIR = tm.PROJECT_ROOT
 DEMO_DIR = os.path.join(ROOT_DIR, 'demo')
 PYTHON_DEMO_DIR = os.path.join(DEMO_DIR, 'guide-python')

@@ -19,21 +17,27 @@ def test_basic_walkthrough():
     os.remove('dump.raw.txt')


+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_multiclass_objective():
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_softmax.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)


+@pytest.mark.skipif(**tm.no_matplotlib())
 def test_custom_rmsle_objective():
-    major, minor = sys.version_info[:2]
-    if minor < 6:
-        pytest.skip('Skipping RMLSE test due to Python version being too low.')
     script = os.path.join(PYTHON_DEMO_DIR, 'custom_rmsle.py')
     cmd = ['python', script, '--plot=0']
     subprocess.check_call(cmd)
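The demo test added below simply shells out to the new script, whose key assertion
is that a feature with zero weight is never chosen as a split.  The same behaviour
can be checked directly; a rough standalone sketch with synthetic data (all values
are illustrative, not from the patch):

    import numpy as np
    import xgboost as xgb

    rng = np.random.RandomState(1994)
    X, y = rng.randn(512, 8), rng.randn(512)
    m = xgb.DMatrix(X, y)
    # Feature 0 gets zero weight, so it should never appear in get_fscore().
    m.set_info(feature_weights=np.array([0, 1, 1, 1, 1, 1, 1, 1],
                                        dtype=np.float32))
    bst = xgb.train({'tree_method': 'hist', 'colsample_bynode': 0.5}, m,
                    num_boost_round=10)
    assert 'f0' not in bst.get_fscore()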
+@pytest.mark.skipif(**tm.no_matplotlib())
+def test_feature_weights_demo():
+    script = os.path.join(PYTHON_DEMO_DIR, 'feature_weights.py')
+    cmd = ['python', script, '--plot=0']
+    subprocess.check_call(cmd)
+
+
 @pytest.mark.skipif(**tm.no_sklearn())
 def test_sklearn_demo():
     script = os.path.join(PYTHON_DEMO_DIR, 'sklearn_examples.py')

diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py
index 8daf7f3573cb..bc4a906aad02 100644
--- a/tests/python/test_dmatrix.py
+++ b/tests/python/test_dmatrix.py
@@ -75,6 +75,11 @@ def test_slice(self):
         X = rng.randn(100, 100)
         y = rng.randint(low=0, high=3, size=100)
         d = xgb.DMatrix(X, y)
+        np.testing.assert_equal(d.get_label(), y.astype(np.float32))
+
+        fw = rng.uniform(size=100).astype(np.float32)
+        d.set_info(feature_weights=fw)
+
         eval_res_0 = {}
         booster = xgb.train(
             {'num_class': 3, 'objective': 'multi:softprob'}, d,
@@ -82,19 +87,23 @@ def test_slice(self):

         predt = booster.predict(d)
         predt = predt.reshape(100 * 3, 1)
+
         d.set_base_margin(predt)

         ridxs = [1, 2, 3, 4, 5, 6]
-        d = d.slice(ridxs)
-        sliced_margin = d.get_float_info('base_margin')
+        sliced = d.slice(ridxs)
+
+        sliced_margin = sliced.get_float_info('base_margin')
         assert sliced_margin.shape[0] == len(ridxs) * 3

         eval_res_1 = {}
-        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, d,
-                  num_boost_round=2, evals=[(d, 'd')], evals_result=eval_res_1)
+        xgb.train({'num_class': 3, 'objective': 'multi:softprob'}, sliced,
+                  num_boost_round=2, evals=[(sliced, 'd')],
+                  evals_result=eval_res_1)

         eval_res_0 = eval_res_0['d']['merror']
         eval_res_1 = eval_res_1['d']['merror']
+
         for i in range(len(eval_res_0)):
             assert abs(eval_res_0[i] - eval_res_1[i]) < 0.02

@@ -172,13 +181,33 @@ def test_get_info(self):
         dtrain.get_float_info('base_margin')
         dtrain.get_uint_info('group_ptr')

+    def test_feature_weights(self):
+        kRows = 10
+        kCols = 50
+        rng = np.random.RandomState(1994)
+        fw = rng.uniform(size=kCols)
+        X = rng.randn(kRows, kCols)
+        m = xgb.DMatrix(X)
+        m.set_info(feature_weights=fw)
+        np.testing.assert_allclose(fw, m.get_float_info('feature_weights'))
+        # Handle empty
+        m.set_info(feature_weights=np.empty((0, 0)))
+
+        assert m.get_float_info('feature_weights').shape[0] == 0
+
+        fw -= 1
+
+        def assign_weight():
+            m.set_info(feature_weights=fw)
+        self.assertRaises(ValueError, assign_weight)
+
     def test_sparse_dmatrix_csr(self):
         nrow = 100
         ncol = 1000
         x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng)
         assert x.indices.max() < ncol - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}
@@ -191,7 +220,7 @@ def test_sparse_dmatrix_csc(self):
         x = rand(nrow, ncol, density=0.0005, format='csc', random_state=rng)
         assert x.indices.max() < nrow - 1
         x.data[:] = 1
-        dtrain = xgb.DMatrix(x, label=np.random.binomial(1, 0.3, nrow))
+        dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow))
         assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)
         watchlist = [(dtrain, 'train')]
         param = {'max_depth': 3, 'objective': 'binary:logistic', 'verbosity': 0}

diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py
index 7f62a3e83052..ce0b57e823ff 100644
--- a/tests/python/test_with_sklearn.py
+++ b/tests/python/test_with_sklearn.py
@@ -1,3 +1,5 @@
+import collections
+import importlib.util
 import numpy as np
 import xgboost as xgb
 from xgboost.sklearn import XGBoostLabelEncoder
@@ -654,6 +656,7 @@ def test_validation_weights_xgbmodel():
                   eval_set=[(X_train, y_train), (X_test, y_test)],
                   sample_weight_eval_set=[weights_train])

+
 def test_validation_weights_xgbclassifier():
     from sklearn.datasets import make_hastie_10_2

@@ -920,6 +923,64 @@ def test_pandas_input():
                             np.array([0, 1]))


+def run_feature_weights(increasing):
+    with TemporaryDirectory() as tmpdir:
+        kRows = 512
+        kCols = 64
+        colsample_bynode = 0.5
+        reg = xgb.XGBRegressor(tree_method='hist',
+                               colsample_bynode=colsample_bynode)
+        X = rng.randn(kRows, kCols)
+        y = rng.randn(kRows)
+        fw = np.ones(shape=(kCols,))
+        for i in range(kCols):
+            if increasing:
+                fw[i] *= float(i)
+            else:
+                fw[i] *= float(kCols - i)
+
+        reg.fit(X, y, feature_weights=fw)
+        model_path = os.path.join(tmpdir, 'model.json')
+        reg.save_model(model_path)
+        with open(model_path) as fd:
+            model = json.load(fd)
+
+        parser_path = os.path.join(tm.PROJECT_ROOT, 'demo', 'json-model',
+                                   'json_parser.py')
+        spec = importlib.util.spec_from_file_location("JsonParser",
+                                                      parser_path)
+        foo = importlib.util.module_from_spec(spec)
+        spec.loader.exec_module(foo)
+        model = foo.Model(model)
+        splits = {}
+        total_nodes = 0
+        for tree in model.trees:
+            n_nodes = len(tree.nodes)
+            total_nodes += n_nodes
+            for n in range(n_nodes):
+                if tree.is_leaf(n):
+                    continue
+                if splits.get(tree.split_index(n), None) is None:
+                    splits[tree.split_index(n)] = 1
+                else:
+                    splits[tree.split_index(n)] += 1
+
+        od = collections.OrderedDict(sorted(splits.items()))
+        tuples = [(k, v) for k, v in od.items()]
+        k, v = list(zip(*tuples))
+        w = np.polyfit(k, v, deg=1)
+        return w
+
+
+def test_feature_weights():
+    poly_increasing = run_feature_weights(True)
+    poly_decreasing = run_feature_weights(False)
+    # Approximate test; the exact split counts depend on the implementation
+    # of the random number generator in the standard library.
+    assert poly_increasing[0] > 0.08
+    assert poly_decreasing[0] < -0.08
+
+
 class TestBoostFromPrediction(unittest.TestCase):
     def run_boost_from_prediction(self, tree_method):
         from sklearn.datasets import load_breast_cancer