Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support multi-class with base margin. #7381

Merged
merged 15 commits into from Nov 2, 2021
2 changes: 1 addition & 1 deletion python-package/xgboost/core.py
Expand Up @@ -577,7 +577,7 @@ def __init__(

# force into void_p, mac need to pass things in as void_p
if data is None:
self.handle = None
self.handle: Optional[ctypes.c_void_p] = None
return

from .data import dispatch_data_backend, _is_iter
Expand Down
4 changes: 1 addition & 3 deletions python-package/xgboost/dask.py
Expand Up @@ -1432,9 +1432,7 @@ def inplace_predict( # pylint: disable=unused-argument
Value in the input data which needs to be present as a missing
value. If None, defaults to np.nan.
base_margin:
See :py:obj:`xgboost.DMatrix` for details. Right now classifier is not well
supported with base_margin as it requires the size of base margin to be `n_classes
* n_samples`.
See :py:obj:`xgboost.DMatrix` for details.

.. versionadded:: 1.4.0

Expand Down
66 changes: 43 additions & 23 deletions python-package/xgboost/data.py
Expand Up @@ -18,6 +18,11 @@

CAT_T = "c"

# Meta info that can be a matrix instead of a vector.  For now it's only
# base_margin for multi-class, but it can be extended to label once we have
# multi-output support.
_matrix_meta = {"base_margin"}


def _warn_unused_missing(data, missing):
if (missing is not None) and (not np.isnan(missing)):
Expand Down Expand Up @@ -217,7 +222,7 @@ def _is_modin_df(data):
}


def _invalid_dataframe_dtype(data) -> None:
def _invalid_dataframe_dtype(data: Any) -> None:
# pandas series has `dtypes` but it's just a single object
# cudf series doesn't have `dtypes`.
if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"):
Expand Down Expand Up @@ -291,7 +296,7 @@ def _transform_pandas_df(
else:
transformed = data

if meta and len(data.columns) > 1:
if meta and len(data.columns) > 1 and meta not in _matrix_meta:
raise ValueError(f"DataFrame for {meta} cannot have multiple columns")

dtype = meta_type if meta_type else np.float32
Expand Down Expand Up @@ -323,6 +328,18 @@ def _is_pandas_series(data):
return isinstance(data, pd.Series)


def _meta_from_pandas_series(
    data, name: str, dtype: Optional[str], handle: ctypes.c_void_p
) -> None:
    """Transform a pandas Series into meta data (e.g. labels) on the DMatrix."""
    from pandas.api.types import is_sparse

    values = data.values.astype('float')
    # NOTE(review): the sparse check runs on the converted array — a sparse
    # series keeps a SparseArray through astype, so densify before dispatch.
    if is_sparse(values):
        values = values.to_dense()
    # Meta vectors must be 1-dim, or 2-dim with at most a single column.
    assert len(values.shape) == 1 or values.shape[1] == 0 or values.shape[1] == 1
    _meta_from_numpy(values, name, dtype, handle)


def _is_modin_series(data):
try:
import modin.pandas as pd
Expand Down Expand Up @@ -374,9 +391,9 @@ def _transform_dt_df(
):
"""Validate feature names and types if data table"""
if meta and data.shape[1] > 1:
raise ValueError(
'DataTable for label or weight cannot have multiple columns')
raise ValueError('DataTable for meta info cannot have multiple columns')
if meta:
meta_type = "float" if meta_type is None else meta_type
# below requires new dt version
# extract first column
data = data.to_numpy()[:, 0].astype(meta_type)
Expand Down Expand Up @@ -820,19 +837,27 @@ def _to_data_type(dtype: str, name: str):
return dtype_map[dtype]


def _validate_meta_shape(data, name: str) -> None:
def _validate_meta_shape(data: Any, name: str) -> None:
if hasattr(data, "shape"):
msg = f"Invalid shape: {data.shape} for {name}"
if name in _matrix_meta:
if len(data.shape) > 2:
raise ValueError(msg)
return

if len(data.shape) > 2 or (
len(data.shape) == 2 and (data.shape[1] != 0 and data.shape[1] != 1)
):
raise ValueError(f"Invalid shape: {data.shape} for {name}")


def _meta_from_numpy(data, field, dtype, handle):
def _meta_from_numpy(
data: np.ndarray, field: str, dtype, handle: ctypes.c_void_p
) -> None:
data = _maybe_np_slice(data, dtype)
interface = data.__array_interface__
assert interface.get('mask', None) is None, 'Masked array is not supported'
size = data.shape[0]
size = data.size

c_type = _to_data_type(str(data.dtype), field)
ptr = interface['data'][0]
Expand All @@ -855,17 +880,13 @@ def _meta_from_tuple(data, field, dtype, handle):
return _meta_from_list(data, field, dtype, handle)


def _meta_from_cudf_df(data, field, handle):
if len(data.columns) != 1:
raise ValueError(
'Expecting meta-info to contain a single column')
data = data[data.columns[0]]

interface = bytes(json.dumps([data.__cuda_array_interface__],
indent=2), 'utf-8')
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle,
c_str(field),
interface))
def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None:
    """Set meta info ``field`` on the DMatrix from a cuDF DataFrame."""
    if field in _matrix_meta:
        # Matrix-valued meta info: hand the whole 2-dim device array over.
        interface = _cuda_array_interface(data.values)
        _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface))
    else:
        # Vector meta info: dispatch on the first (and only expected) column.
        _meta_from_cudf_series(data.iloc[:, 0], field, handle)


def _meta_from_cudf_series(data, field, handle):
Expand All @@ -885,14 +906,15 @@ def _meta_from_cupy_array(data, field, handle):
interface))


def _meta_from_dt(data, field, dtype, handle):
data, _, _ = _transform_dt_df(data, None, None)
def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p):
    """Set meta info ``field`` from a datatable Frame."""
    # Validate/convert the frame first, then reuse the numpy path.
    transformed, _, _ = _transform_dt_df(data, None, None, field, dtype)
    _meta_from_numpy(transformed, field, dtype, handle)


def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
'''Dispatch for meta info.'''
handle = matrix.handle
assert handle is not None
_validate_meta_shape(data, name)
if data is None:
return
Expand All @@ -911,9 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
data = data.values.astype('float')
assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1
_meta_from_numpy(data, name, dtype, handle)
_meta_from_pandas_series(data, name, dtype, handle)
return
if _is_dlpack(data):
data = _transform_dlpack(data)
Expand Down
28 changes: 15 additions & 13 deletions src/data/array_interface.h
Expand Up @@ -210,27 +210,28 @@ class ArrayInterfaceHandler {
}

static void ExtractStride(std::map<std::string, Json> const &column,
size_t strides[2], size_t rows, size_t cols, size_t itemsize) {
size_t *stride_r, size_t *stride_c, size_t rows,
size_t cols, size_t itemsize) {
auto strides_it = column.find("strides");
if (strides_it == column.cend() || IsA<Null>(strides_it->second)) {
// default strides
strides[0] = cols;
strides[1] = 1;
*stride_r = cols;
*stride_c = 1;
} else {
// strides specified by the array interface
auto const &j_strides = get<Array const>(strides_it->second);
CHECK_LE(j_strides.size(), 2) << ArrayInterfaceErrors::Dimension(2);
strides[0] = get<Integer const>(j_strides[0]) / itemsize;
*stride_r = get<Integer const>(j_strides[0]) / itemsize;
size_t n = 1;
if (j_strides.size() == 2) {
n = get<Integer const>(j_strides[1]) / itemsize;
}
strides[1] = n;
*stride_c = n;
}

auto valid = rows * strides[0] + cols * strides[1] >= (rows * cols);
auto valid = rows * (*stride_r) + cols * (*stride_c) >= (rows * cols);
CHECK(valid) << "Invalid strides in array."
<< " strides: (" << strides[0] << "," << strides[1]
<< " strides: (" << (*stride_r) << "," << (*stride_c)
<< "), shape: (" << rows << ", " << cols << ")";
}

Expand Down Expand Up @@ -281,8 +282,8 @@ class ArrayInterface {
<< "Masked array is not yet supported.";
}

ArrayInterfaceHandler::ExtractStride(array, strides, num_rows, num_cols,
typestr[2] - '0');
ArrayInterfaceHandler::ExtractStride(array, &stride_row, &stride_col,
num_rows, num_cols, typestr[2] - '0');

auto stream_it = array.find("stream");
if (stream_it != array.cend() && !IsA<Null>(stream_it->second)) {
Expand Down Expand Up @@ -323,8 +324,8 @@ class ArrayInterface {
num_rows = std::max(num_rows, static_cast<size_t>(num_cols));
num_cols = 1;

strides[0] = std::max(strides[0], strides[1]);
strides[1] = 1;
stride_row = std::max(stride_row, stride_col);
stride_col = 1;
}

void AssignType(StringView typestr) {
Expand Down Expand Up @@ -406,13 +407,14 @@ class ArrayInterface {
template <typename T = float>
XGBOOST_DEVICE T GetElement(size_t r, size_t c) const {
return this->DispatchCall(
[=](auto *p_values) -> T { return p_values[strides[0] * r + strides[1] * c]; });
[=](auto *p_values) -> T { return p_values[stride_row * r + stride_col * c]; });
}

RBitField8 valid;
bst_row_t num_rows;
bst_feature_t num_cols;
size_t strides[2]{0, 0};
size_t stride_row{0};
size_t stride_col{0};
void* data;
Type type;
};
Expand Down
34 changes: 20 additions & 14 deletions src/data/data.cu
Expand Up @@ -30,12 +30,16 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector<float>* out) {
return;
}
out->SetDevice(ptr_device);
out->Resize(column.num_rows);

auto p_dst = thrust::device_pointer_cast(out->DevicePointer());
size_t size = column.num_rows * column.num_cols;
CHECK_NE(size, 0);
out->Resize(size);

dh::LaunchN(column.num_rows, [=] __device__(size_t idx) {
p_dst[idx] = column.GetElement(idx, 0);
auto p_dst = thrust::device_pointer_cast(out->DevicePointer());
dh::LaunchN(size, [=] __device__(size_t idx) {
size_t ridx = idx / column.num_cols;
size_t cidx = idx - (ridx * column.num_cols);
p_dst[idx] = column.GetElement(ridx, cidx);
});
}

Expand Down Expand Up @@ -126,23 +130,27 @@ void ValidateQueryGroup(std::vector<bst_group_t> const &group_ptr_);

void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()});
auto const& j_arr = get<Array>(j_interface);
CHECK_EQ(j_arr.size(), 1)
<< "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1);
ArrayInterface array_interface(interface_str);
std::string key{c_key};
if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) ||
(array_interface.num_cols == 0 && array_interface.num_rows == 1))) {
// Not an empty column, transform it.
array_interface.AsColumnVector();
}

CHECK(!array_interface.valid.Data())
<< "Meta info " << key << " should be dense, found validity mask";
if (array_interface.num_rows == 0) {
return;
}

if (key == "base_margin") {
CopyInfoImpl(array_interface, &base_margin_);
return;
}

CHECK(array_interface.num_cols == 1 || array_interface.num_rows == 1)
<< "MetaInfo: " << c_key << " has invalid shape";
if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) ||
(array_interface.num_cols == 0 && array_interface.num_rows == 1))) {
// Not an empty column, transform it.
array_interface.AsColumnVector();
}
if (key == "label") {
CopyInfoImpl(array_interface, &labels_);
auto ptr = labels_.ConstDevicePointer();
Expand All @@ -155,8 +163,6 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) {
auto valid = thrust::none_of(thrust::device, ptr, ptr + weights_.Size(),
WeightsCheck{});
CHECK(valid) << "Weights must be positive values.";
} else if (key == "base_margin") {
CopyInfoImpl(array_interface, &base_margin_);
} else if (key == "group") {
CopyGroupInfoImpl(array_interface, &group_ptr_);
ValidateQueryGroup(group_ptr_);
Expand Down
27 changes: 8 additions & 19 deletions src/predictor/cpu_predictor.cc
Expand Up @@ -290,27 +290,16 @@ class CPUPredictor : public Predictor {
const auto& base_margin = info.base_margin_.HostVector();
out_preds->Resize(n);
std::vector<bst_float>& out_preds_h = out_preds->HostVector();
if (base_margin.size() == n) {
CHECK_EQ(out_preds->Size(), n);
std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
} else {
if (!base_margin.empty()) {
std::ostringstream oss;
oss << "Ignoring the base margin, since it has incorrect length. "
<< "The base margin must be an array of length ";
if (model.learner_model_param->num_output_group > 1) {
oss << "[num_class] * [number of data points], i.e. "
<< model.learner_model_param->num_output_group << " * " << info.num_row_
<< " = " << n << ". ";
} else {
oss << "[number of data points], i.e. " << info.num_row_ << ". ";
}
oss << "Instead, all data points will use "
<< "base_score = " << model.learner_model_param->base_score;
LOG(WARNING) << oss.str();
}
if (base_margin.empty()) {
std::fill(out_preds_h.begin(), out_preds_h.end(),
model.learner_model_param->base_score);
} else {
std::string expected{
"(" + std::to_string(info.num_row_) + ", " +
std::to_string(model.learner_model_param->num_output_group) + ")"};
CHECK_EQ(base_margin.size(), n)
<< "Invalid shape of base_margin. Expected:" << expected;
std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin());
}
}

Expand Down
6 changes: 5 additions & 1 deletion src/predictor/gpu_predictor.cu
Expand Up @@ -938,7 +938,11 @@ class GPUPredictor : public xgboost::Predictor {
out_preds->SetDevice(generic_param_->gpu_id);
out_preds->Resize(n);
if (base_margin.Size() != 0) {
CHECK_EQ(base_margin.Size(), n);
std::string expected{
"(" + std::to_string(info.num_row_) + ", " +
std::to_string(model.learner_model_param->num_output_group) + ")"};
CHECK_EQ(base_margin.Size(), n)
<< "Invalid shape of base_margin. Expected:" << expected;
out_preds->Copy(base_margin);
} else {
out_preds->Fill(model.learner_model_param->base_score);
Expand Down
2 changes: 2 additions & 0 deletions tests/cpp/data/test_metainfo.cc
Expand Up @@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) {
EXPECT_THROW(info.Validate(1), dmlc::Error);

xgboost::HostDeviceVector<xgboost::bst_group_t> d_groups{groups};
d_groups.SetDevice(0);
d_groups.DevicePointer(); // pull to device
auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1);
std::string arr_interface_str;
xgboost::Json::Dump(arr_interface, &arr_interface_str);
Expand Down
3 changes: 3 additions & 0 deletions tests/python-gpu/test_from_cudf.py
Expand Up @@ -5,6 +5,7 @@

sys.path.append("tests/python")
import testing as tm
from test_dmatrix import set_base_margin_info


def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN):
Expand Down Expand Up @@ -142,6 +143,8 @@ def _test_cudf_metainfo(DMatrixT):
dmat_cudf.get_float_info('base_margin'))
assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr'))

set_base_margin_info(df, DMatrixT, "gpu_hist")


class TestFromColumnar:
'''Tests for constructing DMatrix from data structure conforming Apache
Expand Down
3 changes: 3 additions & 0 deletions tests/python-gpu/test_from_cupy.py
Expand Up @@ -5,6 +5,7 @@

sys.path.append("tests/python")
import testing as tm
from test_dmatrix import set_base_margin_info


def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN):
Expand Down Expand Up @@ -107,6 +108,8 @@ def _test_cupy_metainfo(DMatrixT):
assert np.array_equal(dmat.get_uint_info('group_ptr'),
dmat_cupy.get_uint_info('group_ptr'))

set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist")


@pytest.mark.skipif(**tm.no_cupy())
@pytest.mark.skipif(**tm.no_sklearn())
Expand Down