From 31c1e13f901b7800d7304f0149925f0a30b97ed6 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Fri, 17 Sep 2021 04:37:09 +0800 Subject: [PATCH] Categorical data support in CPU sketching. (#7221) --- src/common/hist_util.h | 3 +- src/common/quantile.cc | 60 ++++++++++++++++++++++-------- src/common/quantile.h | 9 ++++- tests/cpp/common/test_hist_util.cc | 14 +++++++ tests/cpp/common/test_hist_util.cu | 47 ++++++----------------- tests/cpp/common/test_hist_util.h | 45 ++++++++++++++++++++++ tests/cpp/common/test_quantile.cc | 8 +++- 7 files changed, 129 insertions(+), 57 deletions(-) diff --git a/src/common/hist_util.h b/src/common/hist_util.h index fc49148b0765..9dc0bd1c5fd1 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -1,5 +1,5 @@ /*! - * Copyright 2017-2020 by Contributors + * Copyright 2017-2021 by Contributors * \file hist_util.h * \brief Utility for fast histogram aggregation * \author Philip Cho, Tianqi Chen @@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins, } } HostSketchContainer container(reduced, max_bins, + m->Info().feature_types.ConstHostSpan(), HostSketchContainer::UseGroup(info), threads); for (auto const &page : m->GetBatches()) { container.PushRowPage(page, info, hessian); diff --git a/src/common/quantile.cc b/src/common/quantile.cc index a50602b152c0..4e84719c0537 100644 --- a/src/common/quantile.cc +++ b/src/common/quantile.cc @@ -1,29 +1,35 @@ /*! - * Copyright 2020 by XGBoost Contributors + * Copyright 2020-2021 by XGBoost Contributors */ #include #include #include "quantile.h" #include "hist_util.h" +#include "categorical.h" namespace xgboost { namespace common { -HostSketchContainer::HostSketchContainer(std::vector columns_size, - int32_t max_bins, bool use_group, - int32_t n_threads) - : columns_size_{std::move(columns_size)}, max_bins_{max_bins}, +HostSketchContainer::HostSketchContainer( + std::vector columns_size, int32_t max_bins, + common::Span feature_types, bool use_group, + int32_t n_threads) + : feature_types_(feature_types.cbegin(), feature_types.cend()), + columns_size_{std::move(columns_size)}, max_bins_{max_bins}, use_group_ind_{use_group}, n_threads_{n_threads} { monitor_.Init(__func__); CHECK_NE(columns_size_.size(), 0); sketches_.resize(columns_size_.size()); CHECK_GE(n_threads_, 1); + categories_.resize(columns_size_.size()); ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) { auto n_bins = std::min(static_cast(max_bins_), columns_size_[i]); n_bins = std::max(n_bins, static_cast(1)); auto eps = 1.0 / (static_cast(n_bins) * WQSketch::kFactor); - sketches_[i].Init(columns_size_[i], eps); - sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2); + if (!IsCat(this->feature_types_, i)) { + sketches_[i].Init(columns_size_[i], eps); + sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2); + } }); } @@ -182,13 +188,21 @@ void HostSketchContainer::PushRowPage( auto p_inst = inst.data(); if (is_dense) { for (size_t ii = begin; ii < end; ii++) { - sketches_[ii].Push(p_inst[ii].fvalue, w); + if (IsCat(feature_types_, ii)) { + categories_[ii].emplace(p_inst[ii].fvalue); + } else { + sketches_[ii].Push(p_inst[ii].fvalue, w); + } } } else { for (size_t i = 0; i < inst.size(); ++i) { auto const& entry = p_inst[i]; if (entry.index >= begin && entry.index < end) { - sketches_[entry.index].Push(entry.fvalue, w); + if (IsCat(feature_types_, entry.index)) { + categories_[entry.index].emplace(entry.fvalue); + } else { + sketches_[entry.index].Push(entry.fvalue, w); + } } } } @@ -338,6 +352,13 @@ void AddCutPoint(WQuantileSketch::SummaryContainer const &summary, } } +void AddCategories(std::set const &categories, HistogramCuts *cuts) { + auto &cut_values = cuts->cut_values_.HostVector(); + for (auto const &v : categories) { + cut_values.push_back(v); + } +} + void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { monitor_.Start(__func__); std::vector reduced; @@ -348,6 +369,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { std::vector final_summaries(reduced.size()); ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) { + if (IsCat(feature_types_, fidx)) { + return; + } WQSketch::SummaryContainer &a = final_summaries[fidx]; size_t max_num_bins = std::min(num_cuts[fidx], max_bins_); a.Reserve(max_num_bins + 1); @@ -367,13 +391,17 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) { for (size_t fid = 0; fid < reduced.size(); ++fid) { size_t max_num_bins = std::min(num_cuts[fid], max_bins_); WQSketch::SummaryContainer const& a = final_summaries[fid]; - AddCutPoint(a, max_num_bins, cuts); - // push a value that is greater than anything - const bst_float cpt - = (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid]; - // this must be bigger than last value in a scale - const bst_float last = cpt + (fabs(cpt) + 1e-5f); - cuts->cut_values_.HostVector().push_back(last); + if (IsCat(feature_types_, fid)) { + AddCategories(categories_.at(fid), cuts); + } else { + AddCutPoint(a, max_num_bins, cuts); + // push a value that is greater than anything + const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value + : cuts->min_vals_.HostVector()[fid]; + // this must be bigger than last value in a scale + const bst_float last = cpt + (fabs(cpt) + 1e-5f); + cuts->cut_values_.HostVector().push_back(last); + } // Ensure that every feature gets at least one quantile point CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits::max()); diff --git a/src/common/quantile.h b/src/common/quantile.h index c72f5f39160f..b93a7c8c3927 100644 --- a/src/common/quantile.h +++ b/src/common/quantile.h @@ -1,5 +1,5 @@ /*! - * Copyright 2014 by Contributors + * Copyright 2014-2021 by Contributors * \file quantile.h * \brief util to compute quantiles * \author Tianqi Chen @@ -15,6 +15,7 @@ #include #include #include +#include #include "timer.h" @@ -707,6 +708,9 @@ class HostSketchContainer { private: std::vector sketches_; + std::vector> categories_; + std::vector const feature_types_; + std::vector columns_size_; int32_t max_bins_; bool use_group_ind_{false}; @@ -721,7 +725,8 @@ class HostSketchContainer { * \param use_group whether is assigned to group to data instance. */ HostSketchContainer(std::vector columns_size, int32_t max_bins, - bool use_group, int32_t n_threads); + common::Span feature_types, bool use_group, + int32_t n_threads); static bool UseGroup(MetaInfo const &info) { size_t const num_groups = diff --git a/tests/cpp/common/test_hist_util.cc b/tests/cpp/common/test_hist_util.cc index 3abf0d45dc81..b59e994a2339 100644 --- a/tests/cpp/common/test_hist_util.cc +++ b/tests/cpp/common/test_hist_util.cc @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #include #include #include @@ -388,5 +391,16 @@ TEST(HistUtil, SketchFromWeights) { TestSketchFromWeights(true); TestSketchFromWeights(false); } + +TEST(HistUtil, SketchCategoricalFeatures) { + TestCategoricalSketch(1000, 256, 32, false, + [](DMatrix *p_fmat, int32_t num_bins) { + return SketchOnDMatrix(p_fmat, num_bins); + }); + TestCategoricalSketch(1000, 256, 32, true, + [](DMatrix *p_fmat, int32_t num_bins) { + return SketchOnDMatrix(p_fmat, num_bins); + }); +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_hist_util.cu b/tests/cpp/common/test_hist_util.cu index a8a14a8dd948..0be7450e9bfd 100644 --- a/tests/cpp/common/test_hist_util.cu +++ b/tests/cpp/common/test_hist_util.cu @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #include #include @@ -126,43 +129,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) { } } -void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) { - auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); - auto dmat = GetDMatrixFromData(x, n, 1); - dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); - - if (weighted) { - std::vector weights(n, 0); - SimpleLCG lcg; - SimpleRealUniformDistribution dist(0, 1); - for (auto& v : weights) { - v = dist(&lcg); - } - dmat->Info().weights_.HostVector() = weights; - } - - ASSERT_EQ(dmat->Info().feature_types.Size(), 1); - auto cuts = DeviceSketch(0, dmat.get(), num_bins); - std::sort(x.begin(), x.end()); - auto n_uniques = std::unique(x.begin(), x.end()) - x.begin(); - ASSERT_NE(n_uniques, x.size()); - ASSERT_EQ(cuts.TotalBins(), n_uniques); - ASSERT_EQ(n_uniques, num_categories); - - auto& values = cuts.cut_values_.HostVector(); - ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend())); - auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques; - ASSERT_TRUE(is_unique); - - x.resize(n_uniques); - for (size_t i = 0; i < n_uniques; ++i) { - ASSERT_EQ(x[i], values[i]); - } -} - TEST(HistUtil, DeviceSketchCategoricalFeatures) { - TestCategoricalSketch(1000, 256, 32, false); - TestCategoricalSketch(1000, 256, 32, true); + TestCategoricalSketch(1000, 256, 32, false, + [](DMatrix *p_fmat, int32_t num_bins) { + return DeviceSketch(0, p_fmat, num_bins); + }); + TestCategoricalSketch(1000, 256, 32, true, + [](DMatrix *p_fmat, int32_t num_bins) { + return DeviceSketch(0, p_fmat, num_bins); + }); } void TestMixedSketch() { diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index aa91f9c29b79..2efd3a5ba36a 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -1,3 +1,6 @@ +/*! + * Copyright 2019-2021 by XGBoost Contributors + */ #pragma once #include #include @@ -5,6 +8,8 @@ #include #include #include + +#include "../helpers.h" #include "../../../src/common/hist_util.h" #include "../../../src/data/simple_dmatrix.h" #include "../../../src/data/adapter.h" @@ -206,5 +211,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat, } } +/** + * \brief Test for sketching on categorical data. + * + * \param sketch Sketch function, can be on device or on host. + */ +template +void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, + bool weighted, Fn sketch) { + auto x = GenerateRandomCategoricalSingleColumn(n, num_categories); + auto dmat = GetDMatrixFromData(x, n, 1); + dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); + + if (weighted) { + std::vector weights(n, 0); + SimpleLCG lcg; + SimpleRealUniformDistribution dist(0, 1); + for (auto& v : weights) { + v = dist(&lcg); + } + dmat->Info().weights_.HostVector() = weights; + } + + ASSERT_EQ(dmat->Info().feature_types.Size(), 1); + auto cuts = sketch(dmat.get(), num_bins); + std::sort(x.begin(), x.end()); + auto n_uniques = std::unique(x.begin(), x.end()) - x.begin(); + ASSERT_NE(n_uniques, x.size()); + ASSERT_EQ(cuts.TotalBins(), n_uniques); + ASSERT_EQ(n_uniques, num_categories); + + auto& values = cuts.cut_values_.HostVector(); + ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend())); + auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques; + ASSERT_TRUE(is_unique); + + x.resize(n_uniques); + for (size_t i = 0; i < n_uniques; ++i) { + ASSERT_EQ(x[i], values[i]); + } +} } // namespace common } // namespace xgboost diff --git a/tests/cpp/common/test_quantile.cc b/tests/cpp/common/test_quantile.cc index fff74d8e06db..eb4372500806 100644 --- a/tests/cpp/common/test_quantile.cc +++ b/tests/cpp/common/test_quantile.cc @@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) { // Generate cuts for distributed environment. auto sparsity = 0.5f; auto rank = rabit::GetRank(); - HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0)); auto m = RandomDataGenerator{rows, cols, sparsity} .Seed(rank) .Lower(.0f) .Upper(1.0f) .GenerateDMatrix(); + HostSketchContainer sketch_distributed( + column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false, + OmpGetNumThreads(0)); for (auto const &page : m->GetBatches()) { sketch_distributed.PushRowPage(page, m->Info()); } @@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) { rabit::Finalize(); CHECK_EQ(rabit::GetWorldSize(), 1); std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; }); - HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0)); + HostSketchContainer sketch_on_single_node( + column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false, + OmpGetNumThreads(0)); for (auto rank = 0; rank < world; ++rank) { auto m = RandomDataGenerator{rows, cols, sparsity} .Seed(rank)