From bee9b75045d0ec5536be20bef389872b987e864e Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 13 Sep 2021 20:53:34 +0800 Subject: [PATCH 1/5] Initial commit. --- src/common/quantile.cc | 47 +++++++++++++++++++++++++++++++++--------- src/common/quantile.h | 6 +++++- 2 files changed, 42 insertions(+), 11 deletions(-) diff --git a/src/common/quantile.cc b/src/common/quantile.cc index a50602b152c0..bb6b55d6b18b 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -1,29 +1,34 @@ /*! - * Copyright 2020 by XGBoost Contributors + * Copyright 2021 by XGBoost Contributors */ #include #include #include "quantile.h" #include "hist_util.h" +#include "categorical.h" namespace xgboost { namespace common { -HostSketchContainer::HostSketchContainer(std::vector columns_size, - int32_t max_bins, bool use_group, - int32_t n_threads) - : columns_size_{std::move(columns_size)}, max_bins_{max_bins}, +HostSketchContainer::HostSketchContainer( + std::vector columns_size, int32_t max_bins, + common::Span feature_types, bool use_group, int32_t n_threads) + : feature_types_(feature_types.cbegin(), feature_types.cend()), + columns_size_{std::move(columns_size)}, max_bins_{max_bins}, use_group_ind_{use_group}, n_threads_{n_threads} { monitor_.Init(__func__); CHECK_NE(columns_size_.size(), 0); sketches_.resize(columns_size_.size()); CHECK_GE(n_threads_, 1); + categories_.resize(columns_size_.size()); ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) { auto n_bins = std::min(static_cast(max_bins_), columns_size_[i]); n_bins = std::max(n_bins, static_cast(1)); auto eps = 1.0 / (static_cast(n_bins) * WQSketch::kFactor); - sketches_[i].Init(columns_size_[i], eps); - sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2); + if (!IsCat(this->feature_types_, i)) { + sketches_[i].Init(columns_size_[i], eps); + sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2); + } }); } @@ -182,13 +187,21 @@ void HostSketchContainer::PushRowPage( auto p_inst = inst.data(); if (is_dense) { for (size_t ii = begin; ii < end; ii++) { - sketches_[ii].Push(p_inst[ii].fvalue, w); + if (IsCat(feature_types_, ii)) { + categories_[ii].emplace(p_inst[ii].fvalue); + } else { + sketches_[ii].Push(p_inst[ii].fvalue, w); + } } } else { for (size_t i = 0; i < inst.size(); ++i) { auto const& entry = p_inst[i]; if (entry.index >= begin && entry.index < end) { - sketches_[entry.index].Push(entry.fvalue, w); + if (IsCat(feature_types_, entry.index)) { + categories_[entry.index].emplace(entry.fvalue); + } else { + sketches_[entry.index].Push(entry.fvalue, w); + } } } } @@ -338,6 +351,13 @@ void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, } } +void AddCategories(std::set const &categories, HistogramCuts *cuts) { + auto &cut_values = cuts->cut_values_.HostVector(); + for (auto const &v : categories) { + cut_values.push_back(v); + } +} + void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { monitor_.Start(__func__); std::vector reduced; @@ -352,6 +372,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { size_t max_num_bins = std::min(num_cuts[fidx], max_bins_); a.Reserve(max_num_bins + 1); CHECK(a.data); + if (IsCat(feature_types_, fidx)) { + return; + } if (num_cuts[fidx] != 0) { a.SetPrune(reduced[fidx], max_num_bins + 1); CHECK(a.data && reduced[fidx].data); @@ -367,7 +390,11 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { for (size_t fid = 0; fid < reduced.size(); ++fid) { size_t max_num_bins = std::min(num_cuts[fid], max_bins_); WQSketch::SummaryContainer const& a = final_summaries[fid]; - AddCutPoint(a, max_num_bins, cuts); + if (IsCat(feature_types_, fid)) { + AddCategories(categories_.at(fid), cuts); + } else { + AddCutPoint(a, max_num_bins, cuts); + } // push a value that is greater than anything const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid]; diff --git a/src/common/quantile.h b/src/common/quantile.h index c72f5f39160f..8bde8352fbe1 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -707,6 +707,9 @@ class HostSketchContainer { private: std::vector sketches_; + std::vector> categories_; + std::vector feature_types_; + std::vector columns_size_; int32_t max_bins_; bool use_group_ind_{false}; @@ -721,7 +724,8 @@ class HostSketchContainer { * \param use_group whether is assigned to group to data instance. */ HostSketchContainer(std::vector columns_size, int32_t max_bins, - bool use_group, int32_t n_threads); + common::Span feature_types, bool use_group, + int32_t n_threads); static bool UseGroup(MetaInfo const &info) { size_t const num_groups = From 60f59c3abdebdbf1c2723760dc4c713d16a9d372 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 13 Sep 2021 21:19:10 +0800 Subject: [PATCH 2/5] Add tests. --- src/common/hist_util.h | 1 + src/common/quantile.cc | 3 +- src/common/quantile.h | 4 +-- tests/cpp/common/test_hist_util.cc | 11 ++++++++ tests/cpp/common/test_hist_util.cu | 44 ++++++------------------------ tests/cpp/common/test_hist_util.h | 42 ++++++++++++++++++++++++++++ tests/cpp/common/test_quantile.cc | 8 ++++-- 7 files changed, 72 insertions(+), 41 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index fc49148b0765..bd7bfc83f4d7 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, } } HostSketchContainer container(reduced, max_bins, + m->Info().feature_types.ConstHostSpan(), HostSketchContainer::UseGroup(info), threads); for (auto const &page : m->GetBatches()) { container.PushRowPage(page, info, hessian); diff --git a/src/common/quantile.cc b/src/common/quantile.cc index bb6b55d6b18b..d691d9e8c90b 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -12,7 +12,8 @@ namespace common { HostSketchContainer::HostSketchContainer( std::vector columns_size, int32_t max_bins, - common::Span feature_types, bool use_group, int32_t n_threads) + common::Span feature_types, bool use_group, + int32_t n_threads) : feature_types_(feature_types.cbegin(), feature_types.cend()), columns_size_{std::move(columns_size)}, max_bins_{max_bins}, use_group_ind_{use_group}, n_threads_{n_threads} { diff --git a/src/common/quantile.h b/src/common/quantile.h index 8bde8352fbe1..d316a0a58aa2 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -708,7 +708,7 @@ class HostSketchContainer { private: std::vector sketches_; std::vector> categories_; - std::vector feature_types_; + std::vector const feature_types_; std::vector columns_size_; int32_t max_bins_; @@ -724,7 +724,7 @@ class HostSketchContainer { * \param use_group whether is assigned to group to data instance. */ HostSketchContainer(std::vector columns_size, int32_t max_bins, - common::Span feature_types, bool use_group, + common::Span feature_types, bool use_group, int32_t n_threads); static bool UseGroup(MetaInfo const &info) { diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 3abf0d45dc81..8c3de06646ab 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -388,5 +388,16 @@ TEST(HistUtil, SketchFromWeights) { TestSketchFromWeights(true); TestSketchFromWeights(false); } + +TEST(HistUtil, SketchCategoricalFeatures) { + TestCategoricalSketch(1000, 256, 32, false, + [](DMatrix *p_fmat, int32_t num_bins) { + return SketchOnDMatrix(p_fmat, num_bins); + }); + TestCategoricalSketch(1000, 256, 32, true, + [](DMatrix *p_fmat, int32_t num_bins) { + return SketchOnDMatrix(p_fmat, num_bins); + }); +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index a4c18299dfe7..bc67c5e99950 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -126,43 +126,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) { } } -void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) { - auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); - auto dmat = GetDMatrixFromData(x, n, 1); - dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); - - if (weighted) { - std::vector weights(n, 0); - SimpleLCG lcg; - SimpleRealUniformDistribution dist(0, 1); - for (auto& v : weights) { - v = dist(&lcg); - } - dmat->Info().weights_.HostVector() = weights; - } - - ASSERT_EQ(dmat->Info().feature_types.Size(), 1); - auto cuts = DeviceSketch(0, dmat.get(), num_bins); - std::sort(x.begin(), x.end()); - auto n_uniques = std::unique(x.begin(), x.end()) - x.begin(); - ASSERT_NE(n_uniques, x.size()); - ASSERT_EQ(cuts.TotalBins(), n_uniques); - ASSERT_EQ(n_uniques, num_categories); - - auto& values = cuts.cut_values_.HostVector(); - ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend())); - auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques; - ASSERT_TRUE(is_unique); - - x.resize(n_uniques); - for (size_t i = 0; i < n_uniques; ++i) { - ASSERT_EQ(x[i], values[i]); - } -} - TEST(HistUtil, DeviceSketchCategoricalFeatures) { - TestCategoricalSketch(1000, 256, 32, false); - TestCategoricalSketch(1000, 256, 32, true); + TestCategoricalSketch(1000, 256, 32, false, + [](DMatrix *p_fmat, int32_t num_bins) { + return DeviceSketch(0, p_fmat, num_bins); + }); + TestCategoricalSketch(1000, 256, 32, true, + [](DMatrix *p_fmat, int32_t num_bins) { + return DeviceSketch(0, p_fmat, num_bins); + }); } TEST(HistUtil, DeviceSketchMultipleColumns) { diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index aa91f9c29b79..10851f14e29e 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -5,6 +5,8 @@ #include #include #include + +#include "../helpers.h" #include "../../../src/common/hist_util.h" #include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/adapter.h" @@ -206,5 +208,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, } } +/** + * \brief Test for sketching on categorical data. + * + * \param sketch Sketch function, can be on device or on host. + */ +template +void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, + bool weighted, Fn sketch) { + auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); + auto dmat = GetDMatrixFromData(x, n, 1); + dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); + + if (weighted) { + std::vector weights(n, 0); + SimpleLCG lcg; + SimpleRealUniformDistribution dist(0, 1); + for (auto& v : weights) { + v = dist(&lcg); + } + dmat->Info().weights_.HostVector() = weights; + } + + ASSERT_EQ(dmat->Info().feature_types.Size(), 1); + auto cuts = sketch(dmat.get(), num_bins); + std::sort(x.begin(), x.end()); + auto n_uniques = std::unique(x.begin(), x.end()) - x.begin(); + ASSERT_NE(n_uniques, x.size()); + ASSERT_EQ(cuts.TotalBins(), n_uniques); + ASSERT_EQ(n_uniques, num_categories); + + auto& values = cuts.cut_values_.HostVector(); + ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend())); + auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques; + ASSERT_TRUE(is_unique); + + x.resize(n_uniques); + for (size_t i = 0; i < n_uniques; ++i) { + ASSERT_EQ(x[i], values[i]); + } +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index fff74d8e06db..eb4372500806 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) { // Generate cuts for distributed environment. auto sparsity = 0.5f; auto rank = rabit::GetRank(); - HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0)); auto m = RandomDataGenerator{rows, cols, sparsity} .Seed(rank) .Lower(.0f) .Upper(1.0f) .GenerateDMatrix(); + HostSketchContainer sketch_distributed( + column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false, + OmpGetNumThreads(0)); for (auto const &page : m->GetBatches()) { sketch_distributed.PushRowPage(page, m->Info()); } @@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) { rabit::Finalize(); CHECK_EQ(rabit::GetWorldSize(), 1); std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); - HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0)); + HostSketchContainer sketch_on_single_node( + column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false, + OmpGetNumThreads(0)); for (auto rank = 0; rank < world; ++rank) { auto m = RandomDataGenerator{rows, cols, sparsity} .Seed(rank) From d66c033572c150ced51895739f89f45af3562340 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 13 Sep 2021 21:23:57 +0800 Subject: [PATCH 3/5] Fix. --- src/common/quantile.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/common/quantile.cc b/src/common/quantile.cc index d691d9e8c90b..3553d4c35f6d 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -369,13 +369,13 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { std::vector final_summaries(reduced.size()); ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) { + if (IsCat(feature_types_, fidx)) { + return; + } WQSketch::SummaryContainer &a = final_summaries[fidx]; size_t max_num_bins = std::min(num_cuts[fidx], max_bins_); a.Reserve(max_num_bins + 1); CHECK(a.data); - if (IsCat(feature_types_, fidx)) { - return; - } if (num_cuts[fidx] != 0) { a.SetPrune(reduced[fidx], max_num_bins + 1); CHECK(a.data && reduced[fidx].data); @@ -395,13 +395,13 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { AddCategories(categories_.at(fid), cuts); } else { AddCutPoint(a, max_num_bins, cuts); + // push a value that is greater than anything + const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value + : cuts->min_vals_.HostVector()[fid]; + // this must be bigger than last value in a scale + const bst_float last = cpt + (fabs(cpt) + 1e-5f); + cuts->cut_values_.HostVector().push_back(last); } - // push a value that is greater than anything - const bst_float cpt - = (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid]; - // this must be bigger than last value in a scale - const bst_float last = cpt + (fabs(cpt) + 1e-5f); - cuts->cut_values_.HostVector().push_back(last); // Ensure that every feature gets at least one quantile point CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits::max()); From f46827aaddcdf36fe296559475c45a5ad79535dc Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 13 Sep 2021 21:30:38 +0800 Subject: [PATCH 4/5] Lint. --- src/common/hist_util.h | 2 +- src/common/quantile.h | 3 ++- tests/cpp/common/test_hist_util.cc | 3 +++ tests/cpp/common/test_hist_util.cu | 3 +++ tests/cpp/common/test_hist_util.h | 3 +++ 5 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index bd7bfc83f4d7..9dc0bd1c5fd1 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2020 by Contributors + * Copyright 2017-2021 by Contributors * \file hist_util.h * \brief Utility for fast histogram aggregation * \author Philip Cho, Tianqi Chen diff --git a/src/common/quantile.h b/src/common/quantile.h index d316a0a58aa2..b93a7c8c3927 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -1,5 +1,5 @@ /*! - * Copyright 2014 by Contributors + * Copyright 2014-2021 by Contributors * \file quantile.h * \brief util to compute quantiles * \author Tianqi Chen @@ -15,6 +15,7 @@ #include #include #include +#include #include "timer.h" diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 8c3de06646ab..b59e994a2339 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #include #include #include diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index bc67c5e99950..f08ad54ac8fc 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #include #include diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index 10851f14e29e..2efd3a5ba36a 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #pragma once #include #include From 8f5ad72987ee66d4dd3a0fa49d01643f1ae80adf Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 13 Sep 2021 22:05:27 +0800 Subject: [PATCH 5/5] header. --- src/common/quantile.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/quantile.cc b/src/common/quantile.cc index 3553d4c35f6d..4e84719c0537 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -1,5 +1,5 @@ /*! - * Copyright 2021 by XGBoost Contributors + * Copyright 2020-2021 by XGBoost Contributors */ #include #include