Skip to content

Commit

Permalink
Categorical data support in CPU sketching. (#7221)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Sep 16, 2021
1 parent 9f63d6f commit 31c1e13
Show file tree
Hide file tree
Showing 7 changed files with 129 additions and 57 deletions.
3 changes: 2 additions & 1 deletion src/common/hist_util.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2020 by Contributors
* Copyright 2017-2021 by Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
Expand Down Expand Up @@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
}
}
HostSketchContainer container(reduced, max_bins,
m->Info().feature_types.ConstHostSpan(),
HostSketchContainer::UseGroup(info), threads);
for (auto const &page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
Expand Down
60 changes: 44 additions & 16 deletions src/common/quantile.cc
@@ -1,29 +1,35 @@
/*!
* Copyright 2020 by XGBoost Contributors
* Copyright 2020-2021 by XGBoost Contributors
*/
#include <limits>
#include <utility>
#include "quantile.h"
#include "hist_util.h"
#include "categorical.h"

namespace xgboost {
namespace common {

HostSketchContainer::HostSketchContainer(std::vector<bst_row_t> columns_size,
int32_t max_bins, bool use_group,
int32_t n_threads)
: columns_size_{std::move(columns_size)}, max_bins_{max_bins},
HostSketchContainer::HostSketchContainer(
std::vector<bst_row_t> columns_size, int32_t max_bins,
common::Span<FeatureType const> feature_types, bool use_group,
int32_t n_threads)
: feature_types_(feature_types.cbegin(), feature_types.cend()),
columns_size_{std::move(columns_size)}, max_bins_{max_bins},
use_group_ind_{use_group}, n_threads_{n_threads} {
monitor_.Init(__func__);
CHECK_NE(columns_size_.size(), 0);
sketches_.resize(columns_size_.size());
CHECK_GE(n_threads_, 1);
categories_.resize(columns_size_.size());
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1));
auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor);
sketches_[i].Init(columns_size_[i], eps);
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
if (!IsCat(this->feature_types_, i)) {
sketches_[i].Init(columns_size_[i], eps);
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
}
});
}

Expand Down Expand Up @@ -182,13 +188,21 @@ void HostSketchContainer::PushRowPage(
auto p_inst = inst.data();
if (is_dense) {
for (size_t ii = begin; ii < end; ii++) {
sketches_[ii].Push(p_inst[ii].fvalue, w);
if (IsCat(feature_types_, ii)) {
categories_[ii].emplace(p_inst[ii].fvalue);
} else {
sketches_[ii].Push(p_inst[ii].fvalue, w);
}
}
} else {
for (size_t i = 0; i < inst.size(); ++i) {
auto const& entry = p_inst[i];
if (entry.index >= begin && entry.index < end) {
sketches_[entry.index].Push(entry.fvalue, w);
if (IsCat(feature_types_, entry.index)) {
categories_[entry.index].emplace(entry.fvalue);
} else {
sketches_[entry.index].Push(entry.fvalue, w);
}
}
}
}
Expand Down Expand Up @@ -338,6 +352,13 @@ void AddCutPoint(WQuantileSketch<float, float>::SummaryContainer const &summary,
}
}

void AddCategories(std::set<bst_cat_t> const &categories, HistogramCuts *cuts) {
auto &cut_values = cuts->cut_values_.HostVector();
for (auto const &v : categories) {
cut_values.push_back(v);
}
}

void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
monitor_.Start(__func__);
std::vector<WQSketch::SummaryContainer> reduced;
Expand All @@ -348,6 +369,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size());

ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
if (IsCat(feature_types_, fidx)) {
return;
}
WQSketch::SummaryContainer &a = final_summaries[fidx];
size_t max_num_bins = std::min(num_cuts[fidx], max_bins_);
a.Reserve(max_num_bins + 1);
Expand All @@ -367,13 +391,17 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
for (size_t fid = 0; fid < reduced.size(); ++fid) {
size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
WQSketch::SummaryContainer const& a = final_summaries[fid];
AddCutPoint(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt
= (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
if (IsCat(feature_types_, fid)) {
AddCategories(categories_.at(fid), cuts);
} else {
AddCutPoint(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
: cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
}

// Ensure that every feature gets at least one quantile point
CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
Expand Down
9 changes: 7 additions & 2 deletions src/common/quantile.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2014 by Contributors
* Copyright 2014-2021 by Contributors
* \file quantile.h
* \brief util to compute quantiles
* \author Tianqi Chen
Expand All @@ -15,6 +15,7 @@
#include <cstring>
#include <algorithm>
#include <iostream>
#include <set>

#include "timer.h"

Expand Down Expand Up @@ -707,6 +708,9 @@ class HostSketchContainer {

private:
std::vector<WQSketch> sketches_;
std::vector<std::set<bst_cat_t>> categories_;
std::vector<FeatureType> const feature_types_;

std::vector<bst_row_t> columns_size_;
int32_t max_bins_;
bool use_group_ind_{false};
Expand All @@ -721,7 +725,8 @@ class HostSketchContainer {
* \param use_group whether data instances are assigned to groups.
*/
HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins,
bool use_group, int32_t n_threads);
common::Span<FeatureType const> feature_types, bool use_group,
int32_t n_threads);

static bool UseGroup(MetaInfo const &info) {
size_t const num_groups =
Expand Down
14 changes: 14 additions & 0 deletions tests/cpp/common/test_hist_util.cc
@@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <vector>
#include <string>
Expand Down Expand Up @@ -388,5 +391,16 @@ TEST(HistUtil, SketchFromWeights) {
TestSketchFromWeights(true);
TestSketchFromWeights(false);
}

// Categorical sketching on the host: run the shared check once without and
// once with instance weights.
TEST(HistUtil, SketchCategoricalFeatures) {
  auto host_sketch = [](DMatrix *p_fmat, int32_t num_bins) {
    return SketchOnDMatrix(p_fmat, num_bins);
  };
  for (bool weighted : {false, true}) {
    TestCategoricalSketch(1000, 256, 32, weighted, host_sketch);
  }
}
} // namespace common
} // namespace xgboost
47 changes: 11 additions & 36 deletions tests/cpp/common/test_hist_util.cu
@@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <dmlc/filesystem.h>
#include <gtest/gtest.h>

Expand Down Expand Up @@ -126,43 +129,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
}
}

// Sketches a single categorical column with DeviceSketch and checks that the
// resulting cut values are exactly the sorted unique values of the input.
//
// n:              number of rows to generate.
// num_categories: passed to the data generator; the test asserts that the
//                 number of distinct generated values equals it.
// num_bins:       forwarded to DeviceSketch.
// weighted:       when true, per-row weights drawn from
//                 SimpleRealUniformDistribution<float>(0, 1) are attached.
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
// Mark the only column as categorical.
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);

if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}

ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = DeviceSketch(0, dmat.get(), num_bins);
// Reduce the input to its sorted unique prefix; n_uniques is the prefix length.
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
// The input is expected to contain duplicates.
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);

auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
// Note: std::unique compacts the cut values in place.
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);

// Every unique input value must appear as a cut value, in the same order.
x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}

TEST(HistUtil, DeviceSketchCategoricalFeatures) {
TestCategoricalSketch(1000, 256, 32, false);
TestCategoricalSketch(1000, 256, 32, true);
TestCategoricalSketch(1000, 256, 32, false,
[](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
TestCategoricalSketch(1000, 256, 32, true,
[](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
}

void TestMixedSketch() {
Expand Down
45 changes: 45 additions & 0 deletions tests/cpp/common/test_hist_util.h
@@ -1,10 +1,15 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#pragma once
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <random>
#include <vector>
#include <string>
#include <fstream>

#include "../helpers.h"
#include "../../../src/common/hist_util.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/data/adapter.h"
Expand Down Expand Up @@ -206,5 +211,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
}
}

/**
* \brief Test for sketching on categorical data.
*
* \param sketch Sketch function, can be on device or on host.
*/
template <typename Fn>
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins,
bool weighted, Fn sketch) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);

if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}

ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = sketch(dmat.get(), num_bins);
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);

auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);

x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}
} // namespace common
} // namespace xgboost
8 changes: 6 additions & 2 deletions tests/cpp/common/test_quantile.cc
Expand Up @@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for distributed environment.
auto sparsity = 0.5f;
auto rank = rabit::GetRank();
HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0));
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)
.Lower(.0f)
.Upper(1.0f)
.GenerateDMatrix();
HostSketchContainer sketch_distributed(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto const &page : m->GetBatches<SparsePage>()) {
sketch_distributed.PushRowPage(page, m->Info());
}
Expand All @@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
rabit::Finalize();
CHECK_EQ(rabit::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0));
HostSketchContainer sketch_on_single_node(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto rank = 0; rank < world; ++rank) {
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)
Expand Down

0 comments on commit 31c1e13

Please sign in to comment.