Categorical data support in CPU sketching. #7221

Merged (5 commits, Sep 16, 2021)
Changes from all commits
3 changes: 2 additions & 1 deletion src/common/hist_util.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2017-2020 by Contributors
* Copyright 2017-2021 by Contributors
* \file hist_util.h
* \brief Utility for fast histogram aggregation
* \author Philip Cho, Tianqi Chen
@@ -128,6 +128,7 @@ inline HistogramCuts SketchOnDMatrix(DMatrix *m, int32_t max_bins,
}
}
HostSketchContainer container(reduced, max_bins,
m->Info().feature_types.ConstHostSpan(),
HostSketchContainer::UseGroup(info), threads);
for (auto const &page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
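The call-site change above is the whole CPU entry point: `SketchOnDMatrix` now forwards `MetaInfo::feature_types` into `HostSketchContainer`, so a caller opts a column into the categorical path simply by marking it in `MetaInfo`. A rough fragment mirroring `TestCategoricalSketch` later in this diff (assumed to live in `namespace xgboost::common` like the tests; `GenerateRandomCategoricalSingleColumn` and `GetDMatrixFromData` are test utilities, not public API):

```cpp
// Fragment, not a standalone program: assumes the test helpers from
// tests/cpp/helpers.h and tests/cpp/common/test_hist_util.h are in scope.
void SketchOneCategoricalColumn() {
  auto x = GenerateRandomCategoricalSingleColumn(1000, 256);
  auto dmat = GetDMatrixFromData(x, 1000, 1);
  // Marking the single column as categorical is what routes its values into
  // the category set instead of the weighted quantile sketch.
  dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);
  auto cuts = SketchOnDMatrix(dmat.get(), /*max_bins=*/32);
  // cuts.cut_values_ now holds the sorted, de-duplicated category codes.
  (void)cuts;
}
```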
60 changes: 44 additions & 16 deletions src/common/quantile.cc
@@ -1,29 +1,35 @@
/*!
* Copyright 2020 by XGBoost Contributors
* Copyright 2020-2021 by XGBoost Contributors
*/
#include <limits>
#include <utility>
#include "quantile.h"
#include "hist_util.h"
#include "categorical.h"

namespace xgboost {
namespace common {

HostSketchContainer::HostSketchContainer(std::vector<bst_row_t> columns_size,
int32_t max_bins, bool use_group,
int32_t n_threads)
: columns_size_{std::move(columns_size)}, max_bins_{max_bins},
HostSketchContainer::HostSketchContainer(
std::vector<bst_row_t> columns_size, int32_t max_bins,
common::Span<FeatureType const> feature_types, bool use_group,
int32_t n_threads)
: feature_types_(feature_types.cbegin(), feature_types.cend()),
columns_size_{std::move(columns_size)}, max_bins_{max_bins},
use_group_ind_{use_group}, n_threads_{n_threads} {
monitor_.Init(__func__);
CHECK_NE(columns_size_.size(), 0);
sketches_.resize(columns_size_.size());
CHECK_GE(n_threads_, 1);
categories_.resize(columns_size_.size());
ParallelFor(sketches_.size(), n_threads_, Sched::Auto(), [&](auto i) {
auto n_bins = std::min(static_cast<size_t>(max_bins_), columns_size_[i]);
n_bins = std::max(n_bins, static_cast<decltype(n_bins)>(1));
auto eps = 1.0 / (static_cast<float>(n_bins) * WQSketch::kFactor);
sketches_[i].Init(columns_size_[i], eps);
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
if (!IsCat(this->feature_types_, i)) {
sketches_[i].Init(columns_size_[i], eps);
sketches_[i].inqueue.queue.resize(sketches_[i].limit_size * 2);
}
});
}

@@ -182,13 +188,21 @@ void HostSketchContainer::PushRowPage(
auto p_inst = inst.data();
if (is_dense) {
for (size_t ii = begin; ii < end; ii++) {
sketches_[ii].Push(p_inst[ii].fvalue, w);
if (IsCat(feature_types_, ii)) {
categories_[ii].emplace(p_inst[ii].fvalue);
} else {
sketches_[ii].Push(p_inst[ii].fvalue, w);
}
}
} else {
for (size_t i = 0; i < inst.size(); ++i) {
auto const& entry = p_inst[i];
if (entry.index >= begin && entry.index < end) {
sketches_[entry.index].Push(entry.fvalue, w);
if (IsCat(feature_types_, entry.index)) {
categories_[entry.index].emplace(entry.fvalue);
} else {
sketches_[entry.index].Push(entry.fvalue, w);
}
}
}
}
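The per-element dispatch above keys on `IsCat` (from `categorical.h`): categorical columns accumulate their raw category codes in a `std::set`, while numeric columns keep feeding the weighted quantile sketch. A minimal, self-contained illustration of that pattern; `ColumnAccumulator` and `IsCategorical` are made up for this sketch and are not XGBoost types:

```cpp
#include <cstddef>
#include <cstdint>
#include <set>
#include <vector>

// Simplified stand-in for the per-column dispatch in PushRowPage: categorical
// columns collect distinct category codes in a std::set, numeric columns push
// values into a quantile sketch (a plain vector stands in for WQSketch here).
enum class FeatureType : std::uint8_t { kNumerical = 0, kCategorical = 1 };

inline bool IsCategorical(std::vector<FeatureType> const &ft, std::size_t fidx) {
  // Columns default to numerical when no feature types were provided.
  return !ft.empty() && ft[fidx] == FeatureType::kCategorical;
}

struct ColumnAccumulator {
  std::set<float> categories;        // populated only for categorical columns
  std::vector<float> sketch_values;  // stand-in for the weighted quantile sketch

  void Push(std::vector<FeatureType> const &ft, std::size_t fidx, float fvalue) {
    if (IsCategorical(ft, fidx)) {
      categories.emplace(fvalue);    // duplicates collapse; the set stays sorted
    } else {
      sketch_values.push_back(fvalue);
    }
  }
};
```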
@@ -338,6 +352,13 @@ void AddCutPoint(WQuantileSketch<float, float>::SummaryContainer const &summary,
}
}

void AddCategories(std::set<bst_cat_t> const &categories, HistogramCuts *cuts) {
auto &cut_values = cuts->cut_values_.HostVector();
for (auto const &v : categories) {
cut_values.push_back(v);
}
}

void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
monitor_.Start(__func__);
std::vector<WQSketch::SummaryContainer> reduced;
@@ -348,6 +369,9 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
std::vector<WQSketch::SummaryContainer> final_summaries(reduced.size());

ParallelFor(reduced.size(), n_threads_, Sched::Guided(), [&](size_t fidx) {
if (IsCat(feature_types_, fidx)) {
return;
}
WQSketch::SummaryContainer &a = final_summaries[fidx];
size_t max_num_bins = std::min(num_cuts[fidx], max_bins_);
a.Reserve(max_num_bins + 1);
@@ -367,13 +391,17 @@ void HostSketchContainer::MakeCuts(HistogramCuts* cuts) {
for (size_t fid = 0; fid < reduced.size(); ++fid) {
size_t max_num_bins = std::min(num_cuts[fid], max_bins_);
WQSketch::SummaryContainer const& a = final_summaries[fid];
AddCutPoint(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt
= (a.size > 0) ? a.data[a.size - 1].value : cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
if (IsCat(feature_types_, fid)) {
AddCategories(categories_.at(fid), cuts);
} else {
AddCutPoint(a, max_num_bins, cuts);
// push a value that is greater than anything
const bst_float cpt = (a.size > 0) ? a.data[a.size - 1].value
: cuts->min_vals_.HostVector()[fid];
// this must be bigger than last value in a scale
const bst_float last = cpt + (fabs(cpt) + 1e-5f);
cuts->cut_values_.HostVector().push_back(last);
}

// Ensure that every feature gets at least one quantile point
CHECK_LE(cuts->cut_values_.HostVector().size(), std::numeric_limits<uint32_t>::max());
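In `MakeCuts`, the categorical branch emits the sorted, de-duplicated category codes collected earlier (courtesy of `std::set` ordering), while the numeric branch keeps the existing behaviour: quantile boundaries followed by a sentinel strictly greater than the last value. A standalone sketch of the two branches; `MakeCutValues` is a made-up helper for illustration, not XGBoost code:

```cpp
#include <cmath>
#include <set>
#include <vector>

// Illustrates how the cut values for a single feature are assembled.
// `quantile_points` stands in for the pruned WQSketch summary of a numeric
// feature; `min_val` corresponds to cuts->min_vals_ for that feature.
std::vector<float> MakeCutValues(bool is_categorical,
                                 std::set<float> const &categories,
                                 std::vector<float> const &quantile_points,
                                 float min_val) {
  std::vector<float> cut_values;
  if (is_categorical) {
    // std::set iterates in ascending order, so the category codes come out
    // sorted and de-duplicated, matching AddCategories above.
    cut_values.assign(categories.begin(), categories.end());
  } else {
    cut_values = quantile_points;
    // Append a sentinel strictly greater than anything seen so the last bin
    // has an explicit upper bound, mirroring the `last` computation above.
    float const cpt = cut_values.empty() ? min_val : cut_values.back();
    cut_values.push_back(cpt + (std::fabs(cpt) + 1e-5f));
  }
  return cut_values;
}
```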
9 changes: 7 additions & 2 deletions src/common/quantile.h
@@ -1,5 +1,5 @@
/*!
* Copyright 2014 by Contributors
* Copyright 2014-2021 by Contributors
* \file quantile.h
* \brief util to compute quantiles
* \author Tianqi Chen
@@ -15,6 +15,7 @@
#include <cstring>
#include <algorithm>
#include <iostream>
#include <set>

#include "timer.h"

@@ -707,6 +708,9 @@ class HostSketchContainer {

private:
std::vector<WQSketch> sketches_;
std::vector<std::set<bst_cat_t>> categories_;
std::vector<FeatureType> const feature_types_;

std::vector<bst_row_t> columns_size_;
int32_t max_bins_;
bool use_group_ind_{false};
@@ -721,7 +725,8 @@
* \param use_group whether weights are assigned per query group rather than per data instance.
*/
HostSketchContainer(std::vector<bst_row_t> columns_size, int32_t max_bins,
bool use_group, int32_t n_threads);
common::Span<FeatureType const> feature_types, bool use_group,
int32_t n_threads);

static bool UseGroup(MetaInfo const &info) {
size_t const num_groups =
14 changes: 14 additions & 0 deletions tests/cpp/common/test_hist_util.cc
@@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <vector>
#include <string>
@@ -388,5 +391,16 @@ TEST(HistUtil, SketchFromWeights) {
TestSketchFromWeights(true);
TestSketchFromWeights(false);
}

TEST(HistUtil, SketchCategoricalFeatures) {
TestCategoricalSketch(1000, 256, 32, false,
[](DMatrix *p_fmat, int32_t num_bins) {
return SketchOnDMatrix(p_fmat, num_bins);
});
TestCategoricalSketch(1000, 256, 32, true,
[](DMatrix *p_fmat, int32_t num_bins) {
return SketchOnDMatrix(p_fmat, num_bins);
});
}
} // namespace common
} // namespace xgboost
47 changes: 11 additions & 36 deletions tests/cpp/common/test_hist_util.cu
@@ -1,3 +1,6 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#include <dmlc/filesystem.h>
#include <gtest/gtest.h>

@@ -126,43 +129,15 @@ TEST(HistUtil, DeviceSketchCategoricalAsNumeric) {
}
}

void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins, bool weighted) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);

if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}

ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = DeviceSketch(0, dmat.get(), num_bins);
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);

auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);

x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}

TEST(HistUtil, DeviceSketchCategoricalFeatures) {
TestCategoricalSketch(1000, 256, 32, false);
TestCategoricalSketch(1000, 256, 32, true);
TestCategoricalSketch(1000, 256, 32, false,
[](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
TestCategoricalSketch(1000, 256, 32, true,
[](DMatrix *p_fmat, int32_t num_bins) {
return DeviceSketch(0, p_fmat, num_bins);
});
}

TEST(HistUtil, DeviceSketchMultipleColumns) {
45 changes: 45 additions & 0 deletions tests/cpp/common/test_hist_util.h
@@ -1,10 +1,15 @@
/*!
* Copyright 2019-2021 by XGBoost Contributors
*/
#pragma once
#include <gtest/gtest.h>
#include <dmlc/filesystem.h>
#include <random>
#include <vector>
#include <string>
#include <fstream>

#include "../helpers.h"
#include "../../../src/common/hist_util.h"
#include "../../../src/data/simple_dmatrix.h"
#include "../../../src/data/adapter.h"
@@ -206,5 +211,45 @@ inline void ValidateCuts(const HistogramCuts& cuts, DMatrix* dmat,
}
}

/**
* \brief Test for sketching on categorical data.
*
* \param sketch Sketch function, can be on device or on host.
*/
template <typename Fn>
void TestCategoricalSketch(size_t n, size_t num_categories, int32_t num_bins,
bool weighted, Fn sketch) {
auto x = GenerateRandomCategoricalSingleColumn(n, num_categories);
auto dmat = GetDMatrixFromData(x, n, 1);
dmat->Info().feature_types.HostVector().push_back(FeatureType::kCategorical);

if (weighted) {
std::vector<float> weights(n, 0);
SimpleLCG lcg;
SimpleRealUniformDistribution<float> dist(0, 1);
for (auto& v : weights) {
v = dist(&lcg);
}
dmat->Info().weights_.HostVector() = weights;
}

ASSERT_EQ(dmat->Info().feature_types.Size(), 1);
auto cuts = sketch(dmat.get(), num_bins);
std::sort(x.begin(), x.end());
auto n_uniques = std::unique(x.begin(), x.end()) - x.begin();
ASSERT_NE(n_uniques, x.size());
ASSERT_EQ(cuts.TotalBins(), n_uniques);
ASSERT_EQ(n_uniques, num_categories);

auto& values = cuts.cut_values_.HostVector();
ASSERT_TRUE(std::is_sorted(values.cbegin(), values.cend()));
auto is_unique = (std::unique(values.begin(), values.end()) - values.begin()) == n_uniques;
ASSERT_TRUE(is_unique);

x.resize(n_uniques);
for (size_t i = 0; i < n_uniques; ++i) {
ASSERT_EQ(x[i], values[i]);
}
}
} // namespace common
} // namespace xgboost
8 changes: 6 additions & 2 deletions tests/cpp/common/test_quantile.cc
@@ -43,12 +43,14 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
// Generate cuts for distributed environment.
auto sparsity = 0.5f;
auto rank = rabit::GetRank();
HostSketchContainer sketch_distributed(column_size, n_bins, false, OmpGetNumThreads(0));
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)
.Lower(.0f)
.Upper(1.0f)
.GenerateDMatrix();
HostSketchContainer sketch_distributed(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto const &page : m->GetBatches<SparsePage>()) {
sketch_distributed.PushRowPage(page, m->Info());
}
@@ -59,7 +61,9 @@ void TestDistributedQuantile(size_t rows, size_t cols) {
rabit::Finalize();
CHECK_EQ(rabit::GetWorldSize(), 1);
std::for_each(column_size.begin(), column_size.end(), [=](auto& size) { size *= world; });
HostSketchContainer sketch_on_single_node(column_size, n_bins, false, OmpGetNumThreads(0));
HostSketchContainer sketch_on_single_node(
column_size, n_bins, m->Info().feature_types.ConstHostSpan(), false,
OmpGetNumThreads(0));
for (auto rank = 0; rank < world; ++rank) {
auto m = RandomDataGenerator{rows, cols, sparsity}
.Seed(rank)