Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test for invalid categorical data values. #7380

Merged
merged 2 commits into from Nov 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/common/categorical.h
Expand Up @@ -42,9 +42,9 @@ inline XGBOOST_DEVICE bool Decision(common::Span<uint32_t const> cats, bst_cat_t
return !s_cats.Check(cat);
}

inline void CheckCat(bst_cat_t cat) {
CHECK_GE(cat, 0) << "Invalid categorical value detected. Categorical value "
"should be non-negative.";
// Report an invalid categorical value by emitting a LOG(FATAL) message.
// This helper performs no check of its own: the call sites shown in this
// change test for a negative category first and only then call it.
inline void InvalidCategory() {
LOG(FATAL) << "Invalid categorical value detected. Categorical value "
"should be non-negative.";
}

struct IsCatOp {
Expand Down
26 changes: 26 additions & 0 deletions src/common/quantile.cu
Expand Up @@ -580,6 +580,19 @@ void SketchContainer::AllReduce() {
timer_.Stop(__func__);
}

namespace {
// Device predicate used to detect negative values that belong to a
// categorical feature.
//
// - values: flat array of values to inspect (the cut values at the call site).
// - ptrs:   segment offsets passed to dh::SegmentId to resolve the feature of
//           a flat index (presumably CSR-style cut pointers -- confirm).
// - ft:     per-feature type information queried through IsCat.
struct InvalidCat {
  Span<float const> values;
  Span<uint32_t const> ptrs;
  Span<FeatureType const> ft;

  // Returns true when element `i` belongs to a categorical feature and its
  // value is negative.
  XGBOOST_DEVICE bool operator()(size_t i) {
    auto const feature = dh::SegmentId(ptrs, i);
    // Non-categorical features are never reported, whatever their value.
    if (!IsCat(ft, feature)) {
      return false;
    }
    return values[i] < 0;
  }
};
} // anonymous namespace

void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
timer_.Start(__func__);
dh::safe_cuda(cudaSetDevice(device_));
Expand Down Expand Up @@ -669,6 +682,19 @@ void SketchContainer::MakeCuts(HistogramCuts* p_cuts) {
assert(idx+1 < in_column.size());
out_column[idx] = in_column[idx+1].value;
});

if (has_categorical_) {
dh::XGBCachingDeviceAllocator<char> alloc;
auto ptrs = p_cuts->cut_ptrs_.ConstDeviceSpan();
auto it = thrust::make_counting_iterator(0ul);
CHECK_EQ(p_cuts->Ptrs().back(), out_cut_values.size());
auto invalid =
thrust::any_of(thrust::cuda::par(alloc), it, it + out_cut_values.size(),
InvalidCat{out_cut_values, ptrs, d_ft});
if (invalid) {
InvalidCategory();
}
}
timer_.Stop(__func__);
}
} // namespace common
Expand Down
4 changes: 3 additions & 1 deletion src/tree/updater_gpu_hist.cu
Expand Up @@ -580,7 +580,9 @@ struct GPUHistMakerDevice {
CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
<< "Categorical feature value too large.";
auto cat = common::AsCat(candidate.split.fvalue);
common::CheckCat(cat);
if (cat < 0) {
common::InvalidCategory();
}
std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
LBitField32 cats_bits(split_cats);
cats_bits.Set(cat);
Expand Down
15 changes: 15 additions & 0 deletions tests/python-gpu/test_gpu_updaters.py
Expand Up @@ -95,6 +95,21 @@ def test_categorical_32_cat(self):
rounds = 4
self.run_categorical_basic(rows, cols, rounds, cats)

@pytest.mark.skipif(**tm.no_cupy())
def test_invalid_categorical(self):
    """Negative values in categorical features must be rejected.

    Normally distributed data is used so that the input contains negative
    entries; every column is declared categorical (``"c"``), so both the
    host-input and the device-input paths are expected to raise
    ``ValueError``.
    """
    # cupy is only needed for the device-input half of the test; the skipif
    # marker above keeps the test from erroring out when it is unavailable.
    import cupy as cp

    rng = np.random.default_rng()
    X = rng.normal(loc=0, scale=1, size=1000).reshape(100, 10)
    y = rng.normal(loc=0, scale=1, size=100)

    # Check is performed during sketching.
    Xy = xgb.DMatrix(X, y, feature_types=["c"] * 10)
    with pytest.raises(ValueError):
        xgb.train({"tree_method": "gpu_hist"}, Xy)

    # With device input the error is expected already at construction time.
    X, y = cp.array(X), cp.array(y)
    with pytest.raises(ValueError):
        xgb.DeviceQuantileDMatrix(X, y, feature_types=["c"] * 10)

@pytest.mark.skipif(**tm.no_cupy())
@given(parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
Expand Down