From b5b24354b8c2d558f937ef975ad4e996527fac0a Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Sat, 10 Oct 2020 16:12:37 +0800 Subject: [PATCH] More categorical tests and disable shap sparse test. (#6219) * Fix tree load with 32 category. --- src/common/json.cc | 12 +++-- src/tree/tree_model.cc | 9 ++-- tests/cpp/tree/test_tree_model.cc | 72 +++++++++++++++++++++++++ tests/python-gpu/test_gpu_prediction.py | 7 ++- tests/python-gpu/test_gpu_updaters.py | 49 ++++++++++------- 5 files changed, 121 insertions(+), 28 deletions(-) diff --git a/src/common/json.cc b/src/common/json.cc index 98a6abbd6976..e31bbee7bf4d 100644 --- a/src/common/json.cc +++ b/src/common/json.cc @@ -186,7 +186,9 @@ Json& JsonObject::operator[](int ind) { } bool JsonObject::operator==(Value const& rhs) const { - if (!IsA(&rhs)) { return false; } + if (!IsA(&rhs)) { + return false; + } return object_ == Cast(&rhs)->GetObject(); } @@ -275,10 +277,14 @@ Json& JsonNumber::operator[](int ind) { bool JsonNumber::operator==(Value const& rhs) const { if (!IsA(&rhs)) { return false; } + auto r_num = Cast(&rhs)->GetNumber(); if (std::isinf(number_)) { - return std::isinf(Cast(&rhs)->GetNumber()); + return std::isinf(r_num); + } + if (std::isnan(number_)) { + return std::isnan(r_num); } - return std::abs(number_ - Cast(&rhs)->GetNumber()) < kRtEps; + return number_ - r_num == 0; } Value & JsonNumber::operator=(Value const &rhs) { diff --git a/src/tree/tree_model.cc b/src/tree/tree_model.cc index 71196c9f0a71..e4253dbc4b55 100644 --- a/src/tree/tree_model.cc +++ b/src/tree/tree_model.cc @@ -792,16 +792,17 @@ void RegTree::LoadCategoricalSplit(Json const& in) { auto j_begin = get(categories_segments[cnt]); auto j_end = get(categories_sizes[cnt]) + j_begin; bst_cat_t max_cat{std::numeric_limits::min()}; + CHECK_NE(j_end - j_begin, 0) << nidx; for (auto j = j_begin; j < j_end; ++j) { auto const &category = get(categories[j]); auto cat = common::AsCat(category); max_cat = std::max(max_cat, cat); } - size_t size = max_cat == std::numeric_limits::min() - ? 0 - : common::KCatBitField::ComputeStorageSize(max_cat); - size = size == 0 ? 1 : size; + // Have at least 1 category in split. + CHECK_NE(std::numeric_limits::min(), max_cat); + size_t n_cats = max_cat + 1; // cat 0 + size_t size = common::KCatBitField::ComputeStorageSize(n_cats); std::vector cat_bits_storage(size, 0); common::CatBitField cat_bits{common::Span(cat_bits_storage)}; for (auto j = j_begin; j < j_end; ++j) { diff --git a/tests/cpp/tree/test_tree_model.cc b/tests/cpp/tree/test_tree_model.cc index 4b090db46846..83be207626a9 100644 --- a/tests/cpp/tree/test_tree_model.cc +++ b/tests/cpp/tree/test_tree_model.cc @@ -5,6 +5,7 @@ #include "xgboost/json_io.h" #include "xgboost/tree_model.h" #include "../../../src/common/bitfield.h" +#include "../../../src/common/categorical.h" namespace xgboost { #if DMLC_IO_NO_ENDIAN_SWAP // skip on big-endian machines @@ -150,6 +151,77 @@ TEST(Tree, ExpandCategoricalFeature) { } } +void GrowTree(RegTree* p_tree) { + SimpleLCG lcg; + size_t n_expands = 10; + constexpr size_t kCols = 256; + SimpleRealUniformDistribution coin(0.0, 1.0); + SimpleRealUniformDistribution feat(0.0, kCols); + SimpleRealUniformDistribution split_cat(0.0, 128.0); + SimpleRealUniformDistribution split_value(0.0, kCols); + + std::stack stack; + stack.push(RegTree::kRoot); + auto& tree = *p_tree; + + for (size_t i = 0; i < n_expands; ++i) { + auto is_cat = coin(&lcg) <= 0.5; + bst_node_t node = stack.top(); + stack.pop(); + + bst_feature_t f = feat(&lcg); + if (is_cat) { + bst_cat_t cat = common::AsCat(split_cat(&lcg)); + std::vector split_cats( + LBitField32::ComputeStorageSize(cat + 1)); + LBitField32 bitset{split_cats}; + bitset.Set(cat); + tree.ExpandCategorical(node, f, split_cats, true, 1.0, 2.0, 3.0, 11.0, 2.0, + /*left_sum=*/3.0, /*right_sum=*/4.0); + } else { + auto split = split_value(&lcg); + tree.ExpandNode(node, f, split, true, 1.0, 2.0, 3.0, 11.0, 2.0, + /*left_sum=*/3.0, /*right_sum=*/4.0); + } + + stack.push(tree[node].LeftChild()); + stack.push(tree[node].RightChild()); + } +} + +void CheckReload(RegTree const &tree) { + Json out{Object()}; + tree.SaveModel(&out); + + RegTree loaded_tree; + loaded_tree.LoadModel(out); + Json saved{Object()}; + loaded_tree.SaveModel(&saved); + + auto same = out == saved; + ASSERT_TRUE(same); +} + +TEST(Tree, CategoricalIO) { + { + RegTree tree; + bst_cat_t cat = 32; + std::vector split_cats(LBitField32::ComputeStorageSize(cat + 1)); + LBitField32 bitset{split_cats}; + bitset.Set(cat); + tree.ExpandCategorical(0, 0, split_cats, true, 1.0, 2.0, 3.0, 11.0, 2.0, + /*left_sum=*/3.0, /*right_sum=*/4.0); + + CheckReload(tree); + } + + { + RegTree tree; + GrowTree(&tree); + CheckReload(tree); + } +} + namespace { RegTree ConstructTree() { RegTree tree; diff --git a/tests/python-gpu/test_gpu_prediction.py b/tests/python-gpu/test_gpu_prediction.py index 74a27ab54be9..b22caf30658b 100644 --- a/tests/python-gpu/test_gpu_prediction.py +++ b/tests/python-gpu/test_gpu_prediction.py @@ -212,6 +212,10 @@ def test_shap(self, num_rounds, dataset, param): tm.dataset_strategy, shap_parameter_strategy) @settings(deadline=None, max_examples=20) def test_shap_interactions(self, num_rounds, dataset, param): + if dataset.name == 'sparse': + issue = 'https://github.com/dmlc/xgboost/issues/6074' + pytest.xfail(reason=f'GPU shap with sparse is flaky: {issue}') + param.update({"predictor": "gpu_predictor", "gpu_id": 0}) param = dataset.set_params(param) dmat = dataset.get_dmat() @@ -220,5 +224,6 @@ def test_shap_interactions(self, num_rounds, dataset, param): shap = bst.predict(test_dmat, pred_interactions=True) margin = bst.predict(test_dmat, output_margin=True) assume(len(dataset.y) > 0) - assert np.allclose(np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), margin, + assert np.allclose(np.sum(shap, axis=(len(shap.shape) - 1, len(shap.shape) - 2)), + margin, 1e-3, 1e-3) diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py index 40e8af7a7954..48ac2f5e56d4 100644 --- a/tests/python-gpu/test_gpu_updaters.py +++ b/tests/python-gpu/test_gpu_updaters.py @@ -41,7 +41,24 @@ def test_gpu_hist(self, param, num_rounds, dataset): note(result) assert tm.non_increasing(result['train'][dataset.metric]) - def run_categorical_basic(self, cat, onehot, label, rounds): + def run_categorical_basic(self, rows, cols, rounds, cats): + import pandas as pd + rng = np.random.RandomState(1994) + + pd_dict = {} + for i in range(cols): + c = rng.randint(low=0, high=cats+1, size=rows) + pd_dict[str(i)] = pd.Series(c, dtype=np.int64) + + df = pd.DataFrame(pd_dict) + label = df.iloc[:, 0] + for i in range(0, cols-1): + label += df.iloc[:, i] + label += 1 + df = df.astype('category') + onehot = pd.get_dummies(df) + cat = df + by_etl_results = {} by_builtin_results = {} @@ -64,28 +81,20 @@ def run_categorical_basic(self, cat, onehot, label, rounds): rtol=1e-3) assert tm.non_increasing(by_builtin_results['Train']['rmse']) - @given(strategies.integers(10, 400), strategies.integers(5, 10), - strategies.integers(1, 5), strategies.integers(4, 8)) + @given(strategies.integers(10, 400), strategies.integers(3, 8), + strategies.integers(1, 5), strategies.integers(4, 7)) @settings(deadline=None) @pytest.mark.skipif(**tm.no_pandas()) def test_categorical(self, rows, cols, rounds, cats): - import pandas as pd - rng = np.random.RandomState(1994) - - pd_dict = {} - for i in range(cols): - c = rng.randint(low=0, high=cats+1, size=rows) - pd_dict[str(i)] = pd.Series(c, dtype=np.int64) - - df = pd.DataFrame(pd_dict) - label = df.iloc[:, 0] - for i in range(0, cols-1): - label += df.iloc[:, i] - label += 1 - df = df.astype('category') - x = pd.get_dummies(df) - - self.run_categorical_basic(df, x, label, rounds) + self.run_categorical_basic(rows, cols, rounds, cats) + + def test_categorical_32_cat(self): + '''32 hits the bound of integer bitset, so special test''' + rows = 1000 + cols = 10 + cats = 32 + rounds = 4 + self.run_categorical_basic(rows, cols, rounds, cats) @pytest.mark.skipif(**tm.no_cupy()) @given(parameter_strategy, strategies.integers(1, 20),