diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 594ed74bb4eb..464e88656406 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -577,7 +577,7 @@ def __init__( # force into void_p, mac need to pass things in as void_p if data is None: - self.handle = None + self.handle: Optional[ctypes.c_void_p] = None return from .data import dispatch_data_backend, _is_iter diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index cb103b194bb5..9560ff2f1992 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -1432,9 +1432,7 @@ def inplace_predict( # pylint: disable=unused-argument Value in the input data which needs to be present as a missing value. If None, defaults to np.nan. base_margin: - See :py:obj:`xgboost.DMatrix` for details. Right now classifier is not well - supported with base_margin as it requires the size of base margin to be `n_classes - * n_samples`. + See :py:obj:`xgboost.DMatrix` for details. .. versionadded:: 1.4.0 diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 664911a5474b..285b2e840f59 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -18,6 +18,11 @@ CAT_T = "c" +# meta info that can be a matrix instead of vector. +# For now it's base_margin for multi-class, but it can be extended to label once we have +# multi-output. +_matrix_meta = {"base_margin"} + def _warn_unused_missing(data, missing): if (missing is not None) and (not np.isnan(missing)): @@ -217,7 +222,7 @@ def _is_modin_df(data): } -def _invalid_dataframe_dtype(data) -> None: +def _invalid_dataframe_dtype(data: Any) -> None: # pandas series has `dtypes` but it's just a single object # cudf series doesn't have `dtypes`. if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"): @@ -291,7 +296,7 @@ def _transform_pandas_df( else: transformed = data - if meta and len(data.columns) > 1: + if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") dtype = meta_type if meta_type else np.float32 @@ -323,6 +328,18 @@ def _is_pandas_series(data): return isinstance(data, pd.Series) +def _meta_from_pandas_series( + data, name: str, dtype: Optional[str], handle: ctypes.c_void_p +) -> None: + """Help transform pandas series for meta data like labels""" + data = data.values.astype('float') + from pandas.api.types import is_sparse + if is_sparse(data): + data = data.to_dense() + assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 + _meta_from_numpy(data, name, dtype, handle) + + def _is_modin_series(data): try: import modin.pandas as pd @@ -374,9 +391,9 @@ def _transform_dt_df( ): """Validate feature names and types if data table""" if meta and data.shape[1] > 1: - raise ValueError( - 'DataTable for label or weight cannot have multiple columns') + raise ValueError('DataTable for meta info cannot have multiple columns') if meta: + meta_type = "float" if meta_type is None else meta_type # below requires new dt version # extract first column data = data.to_numpy()[:, 0].astype(meta_type) @@ -820,19 +837,27 @@ def _to_data_type(dtype: str, name: str): return dtype_map[dtype] -def _validate_meta_shape(data, name: str) -> None: +def _validate_meta_shape(data: Any, name: str) -> None: if hasattr(data, "shape"): + msg = f"Invalid shape: {data.shape} for {name}" + if name in _matrix_meta: + if len(data.shape) > 2: + raise ValueError(msg) + return + if len(data.shape) > 2 or ( len(data.shape) == 2 and (data.shape[1] != 0 and data.shape[1] != 1) ): raise ValueError(f"Invalid shape: {data.shape} for {name}") -def _meta_from_numpy(data, field, dtype, handle): +def _meta_from_numpy( + data: np.ndarray, field: str, dtype, handle: ctypes.c_void_p +) -> None: data = _maybe_np_slice(data, dtype) interface = data.__array_interface__ assert interface.get('mask', None) is None, 'Masked array is not supported' - size = data.shape[0] + size = data.size c_type = _to_data_type(str(data.dtype), field) ptr = interface['data'][0] @@ -855,17 +880,13 @@ def _meta_from_tuple(data, field, dtype, handle): return _meta_from_list(data, field, dtype, handle) -def _meta_from_cudf_df(data, field, handle): - if len(data.columns) != 1: - raise ValueError( - 'Expecting meta-info to contain a single column') - data = data[data.columns[0]] - - interface = bytes(json.dumps([data.__cuda_array_interface__], - indent=2), 'utf-8') - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, - c_str(field), - interface)) +def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: + if field not in _matrix_meta: + _meta_from_cudf_series(data.iloc[:, 0], field, handle) + else: + data = data.values + interface = _cuda_array_interface(data) + _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) def _meta_from_cudf_series(data, field, handle): @@ -885,14 +906,15 @@ def _meta_from_cupy_array(data, field, handle): interface)) -def _meta_from_dt(data, field, dtype, handle): - data, _, _ = _transform_dt_df(data, None, None) +def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): + data, _, _ = _transform_dt_df(data, None, None, field, dtype) _meta_from_numpy(data, field, dtype, handle) def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): '''Dispatch for meta info.''' handle = matrix.handle + assert handle is not None _validate_meta_shape(data, name) if data is None: return @@ -911,9 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_series(data): - data = data.values.astype('float') - assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 - _meta_from_numpy(data, name, dtype, handle) + _meta_from_pandas_series(data, name, dtype, handle) return if _is_dlpack(data): data = _transform_dlpack(data) diff --git a/src/data/array_interface.h b/src/data/array_interface.h index b7ca311439ae..6524f4512407 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -210,27 +210,28 @@ class ArrayInterfaceHandler { } static void ExtractStride(std::map const &column, - size_t strides[2], size_t rows, size_t cols, size_t itemsize) { + size_t *stride_r, size_t *stride_c, size_t rows, + size_t cols, size_t itemsize) { auto strides_it = column.find("strides"); if (strides_it == column.cend() || IsA(strides_it->second)) { // default strides - strides[0] = cols; - strides[1] = 1; + *stride_r = cols; + *stride_c = 1; } else { // strides specified by the array interface auto const &j_strides = get(strides_it->second); CHECK_LE(j_strides.size(), 2) << ArrayInterfaceErrors::Dimension(2); - strides[0] = get(j_strides[0]) / itemsize; + *stride_r = get(j_strides[0]) / itemsize; size_t n = 1; if (j_strides.size() == 2) { n = get(j_strides[1]) / itemsize; } - strides[1] = n; + *stride_c = n; } - auto valid = rows * strides[0] + cols * strides[1] >= (rows * cols); + auto valid = rows * (*stride_r) + cols * (*stride_c) >= (rows * cols); CHECK(valid) << "Invalid strides in array." - << " strides: (" << strides[0] << "," << strides[1] + << " strides: (" << (*stride_r) << "," << (*stride_c) << "), shape: (" << rows << ", " << cols << ")"; } @@ -281,8 +282,8 @@ class ArrayInterface { << "Masked array is not yet supported."; } - ArrayInterfaceHandler::ExtractStride(array, strides, num_rows, num_cols, - typestr[2] - '0'); + ArrayInterfaceHandler::ExtractStride(array, &stride_row, &stride_col, + num_rows, num_cols, typestr[2] - '0'); auto stream_it = array.find("stream"); if (stream_it != array.cend() && !IsA(stream_it->second)) { @@ -323,8 +324,8 @@ class ArrayInterface { num_rows = std::max(num_rows, static_cast(num_cols)); num_cols = 1; - strides[0] = std::max(strides[0], strides[1]); - strides[1] = 1; + stride_row = std::max(stride_row, stride_col); + stride_col = 1; } void AssignType(StringView typestr) { @@ -406,13 +407,14 @@ class ArrayInterface { template XGBOOST_DEVICE T GetElement(size_t r, size_t c) const { return this->DispatchCall( - [=](auto *p_values) -> T { return p_values[strides[0] * r + strides[1] * c]; }); + [=](auto *p_values) -> T { return p_values[stride_row * r + stride_col * c]; }); } RBitField8 valid; bst_row_t num_rows; bst_feature_t num_cols; - size_t strides[2]{0, 0}; + size_t stride_row{0}; + size_t stride_col{0}; void* data; Type type; }; diff --git a/src/data/data.cu b/src/data/data.cu index 2f298f330743..c4c7c503cb2e 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -30,12 +30,16 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector* out) { return; } out->SetDevice(ptr_device); - out->Resize(column.num_rows); - auto p_dst = thrust::device_pointer_cast(out->DevicePointer()); + size_t size = column.num_rows * column.num_cols; + CHECK_NE(size, 0); + out->Resize(size); - dh::LaunchN(column.num_rows, [=] __device__(size_t idx) { - p_dst[idx] = column.GetElement(idx, 0); + auto p_dst = thrust::device_pointer_cast(out->DevicePointer()); + dh::LaunchN(size, [=] __device__(size_t idx) { + size_t ridx = idx / column.num_cols; + size_t cidx = idx - (ridx * column.num_cols); + p_dst[idx] = column.GetElement(ridx, cidx); }); } @@ -126,16 +130,8 @@ void ValidateQueryGroup(std::vector const &group_ptr_); void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); - auto const& j_arr = get(j_interface); - CHECK_EQ(j_arr.size(), 1) - << "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1); ArrayInterface array_interface(interface_str); std::string key{c_key}; - if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || - (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { - // Not an empty column, transform it. - array_interface.AsColumnVector(); - } CHECK(!array_interface.valid.Data()) << "Meta info " << key << " should be dense, found validity mask"; @@ -143,6 +139,18 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { return; } + if (key == "base_margin") { + CopyInfoImpl(array_interface, &base_margin_); + return; + } + + CHECK(array_interface.num_cols == 1 || array_interface.num_rows == 1) + << "MetaInfo: " << c_key << " has invalid shape"; + if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || + (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { + // Not an empty column, transform it. + array_interface.AsColumnVector(); + } if (key == "label") { CopyInfoImpl(array_interface, &labels_); auto ptr = labels_.ConstDevicePointer(); @@ -155,8 +163,6 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { auto valid = thrust::none_of(thrust::device, ptr, ptr + weights_.Size(), WeightsCheck{}); CHECK(valid) << "Weights must be positive values."; - } else if (key == "base_margin") { - CopyInfoImpl(array_interface, &base_margin_); } else if (key == "group") { CopyGroupInfoImpl(array_interface, &group_ptr_); ValidateQueryGroup(group_ptr_); diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index b113a31761f5..d581f64a1d56 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -290,27 +290,16 @@ class CPUPredictor : public Predictor { const auto& base_margin = info.base_margin_.HostVector(); out_preds->Resize(n); std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.size() == n) { - CHECK_EQ(out_preds->Size(), n); - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } else { - if (!base_margin.empty()) { - std::ostringstream oss; - oss << "Ignoring the base margin, since it has incorrect length. " - << "The base margin must be an array of length "; - if (model.learner_model_param->num_output_group > 1) { - oss << "[num_class] * [number of data points], i.e. " - << model.learner_model_param->num_output_group << " * " << info.num_row_ - << " = " << n << ". "; - } else { - oss << "[number of data points], i.e. " << info.num_row_ << ". "; - } - oss << "Instead, all data points will use " - << "base_score = " << model.learner_model_param->base_score; - LOG(WARNING) << oss.str(); - } + if (base_margin.empty()) { std::fill(out_preds_h.begin(), out_preds_h.end(), model.learner_model_param->base_score); + } else { + std::string expected{ + "(" + std::to_string(info.num_row_) + ", " + + std::to_string(model.learner_model_param->num_output_group) + ")"}; + CHECK_EQ(base_margin.size(), n) + << "Invalid shape of base_margin. Expected:" << expected; + std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); } } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index d1b9c8132659..51674237e973 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -938,7 +938,11 @@ class GPUPredictor : public xgboost::Predictor { out_preds->SetDevice(generic_param_->gpu_id); out_preds->Resize(n); if (base_margin.Size() != 0) { - CHECK_EQ(base_margin.Size(), n); + std::string expected{ + "(" + std::to_string(info.num_row_) + ", " + + std::to_string(model.learner_model_param->num_output_group) + ")"}; + CHECK_EQ(base_margin.Size(), n) + << "Invalid shape of base_margin. Expected:" << expected; out_preds->Copy(base_margin); } else { out_preds->Fill(model.learner_model_param->base_score); diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index a58bda90d9df..bb5452a56d28 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) { EXPECT_THROW(info.Validate(1), dmlc::Error); xgboost::HostDeviceVector d_groups{groups}; + d_groups.SetDevice(0); + d_groups.DevicePointer(); // pull to device auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1); std::string arr_interface_str; xgboost::Json::Dump(arr_interface, &arr_interface_str); diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index 6250ab328208..defc2b2197be 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -5,6 +5,7 @@ sys.path.append("tests/python") import testing as tm +from test_dmatrix import set_base_margin_info def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN): @@ -142,6 +143,8 @@ def _test_cudf_metainfo(DMatrixT): dmat_cudf.get_float_info('base_margin')) assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr')) + set_base_margin_info(df, DMatrixT, "gpu_hist") + class TestFromColumnar: '''Tests for constructing DMatrix from data structure conforming Apache diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index fee5edbb8c44..cbed40777fc6 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -5,6 +5,7 @@ sys.path.append("tests/python") import testing as tm +from test_dmatrix import set_base_margin_info def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN): @@ -107,6 +108,8 @@ def _test_cupy_metainfo(DMatrixT): assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr')) + set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist") + @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_sklearn()) diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index d9b9f8684863..7150566c89a5 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -22,6 +22,7 @@ from test_with_dask import run_empty_dmatrix_auc # noqa from test_with_dask import run_auc # noqa from test_with_dask import run_boost_from_prediction # noqa +from test_with_dask import run_boost_from_prediction_multi_clasas # noqa from test_with_dask import run_dask_classifier # noqa from test_with_dask import run_empty_dmatrix_cls # noqa from test_with_dask import _get_client_workers # noqa @@ -297,13 +298,18 @@ def run_gpu_hist( @pytest.mark.skipif(**tm.no_cudf()) def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None: import cudf - from sklearn.datasets import load_breast_cancer + from sklearn.datasets import load_breast_cancer, load_digits with Client(local_cuda_cluster) as client: X_, y_ = load_breast_cancer(return_X_y=True) X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) run_boost_from_prediction(X, y, "gpu_hist", client) + X_, y_ = load_digits(return_X_y=True) + X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) + y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) + run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client) + class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask()) diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 5f70ef6310b1..17a2ccf832e9 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -35,8 +35,25 @@ def test_gpu_binary_classification(): assert err < 0.1 +@pytest.mark.skipif(**tm.no_cupy()) +@pytest.mark.skipif(**tm.no_cudf()) def test_boost_from_prediction_gpu_hist(): - twskl.run_boost_from_prediction('gpu_hist') + from sklearn.datasets import load_breast_cancer, load_digits + import cupy as cp + import cudf + + tree_method = "gpu_hist" + X, y = load_breast_cancer(return_X_y=True) + X, y = cp.array(X), cp.array(y) + + twskl.run_boost_from_prediction_binary(tree_method, X, y, None) + twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame) + + X, y = load_digits(return_X_y=True) + X, y = cp.array(X), cp.array(y) + + twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None) + twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame) def test_num_parallel_tree(): diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 313cf822fb98..1b5ad266d3d9 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -15,6 +15,24 @@ rng = np.random.RandomState(1994) +def set_base_margin_info(DType, DMatrixT, tm: str): + rng = np.random.default_rng() + X = DType(rng.normal(0, 1.0, size=100).reshape(50, 2)) + if hasattr(X, "iloc"): + y = X.iloc[:, 0] + else: + y = X[:, 0] + base_margin = X + # no error at set + Xy = DMatrixT(X, y, base_margin=base_margin) + # Error at train, caused by check in predictor. + with pytest.raises(ValueError, match=r".*base_margin.*"): + xgb.train({"tree_method": tm}, Xy) + + # FIXME(jiamingy): Currently the metainfo has no concept of shape. If you pass a + # base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined. + + class TestDMatrix: def test_warn_missing(self): from xgboost import data @@ -122,7 +140,7 @@ def test_slice(self): # base margin is per-class in multi-class classifier base_margin = rng.randn(100, 3).astype(np.float32) - d.set_base_margin(base_margin.flatten()) + d.set_base_margin(base_margin) ridxs = [1, 2, 3, 4, 5, 6] sliced = d.slice(ridxs) @@ -380,3 +398,6 @@ def test_uri_categorical(self): feature_types = ["q"] * 5 + ["c"] + ["q"] * 120 Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types) np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types)) + + def test_base_margin(self): + set_base_margin_info(np.asarray, xgb.DMatrix, "hist") diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index f506d0d80e92..3315d3d8f184 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -7,7 +7,7 @@ import numpy as np import scipy import json -from typing import List, Tuple, Dict, Optional, Type, Any +from typing import List, Tuple, Dict, Optional, Type, Any, Callable import asyncio from functools import partial from concurrent.futures import ThreadPoolExecutor @@ -182,6 +182,50 @@ def test_dask_predict_shape_infer(client: "Client") -> None: assert prediction.shape[1] == 3 +def run_boost_from_prediction_multi_clasas( + X: xgb.dask._DaskCollection, + y: xgb.dask._DaskCollection, + tree_method: str, + client: "Client" +) -> None: + model_0 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_0.fit(X=X, y=y) + margin = xgb.dask.inplace_predict( + client, model_0.get_booster(), X, predict_type="margin" + ) + + model_1 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_1.fit(X=X, y=y, base_margin=margin) + predictions_1 = xgb.dask.predict( + client, + model_1.get_booster(), + xgb.dask.DaskDMatrix(client, X, base_margin=margin), + output_margin=True + ) + + model_2 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) + model_2.fit(X=X, y=y) + predictions_2 = xgb.dask.inplace_predict( + client, model_2.get_booster(), X, predict_type="margin" + ) + a = predictions_1.compute() + b = predictions_2.compute() + # cupy/cudf + if hasattr(a, "get"): + a = a.get() + if hasattr(b, "values"): + b = b.values + if hasattr(b, "get"): + b = b.get() + np.testing.assert_allclose(a, b, atol=1e-5) + + def run_boost_from_prediction( X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client" ) -> None: @@ -227,11 +271,15 @@ def run_boost_from_prediction( @pytest.mark.parametrize("tree_method", ["hist", "approx"]) def test_boost_from_prediction(tree_method: str, client: "Client") -> None: - from sklearn.datasets import load_breast_cancer + from sklearn.datasets import load_breast_cancer, load_digits X_, y_ = load_breast_cancer(return_X_y=True) X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) run_boost_from_prediction(X, y, tree_method, client) + X_, y_ = load_digits(return_X_y=True) + X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) + run_boost_from_prediction_multi_clasas(X, y, tree_method, client) + def test_inplace_predict(client: "Client") -> None: from sklearn.datasets import load_boston diff --git a/tests/python/test_with_modin.py b/tests/python/test_with_modin.py index 9a5ca32e5bb7..e997202d12d8 100644 --- a/tests/python/test_with_modin.py +++ b/tests/python/test_with_modin.py @@ -3,6 +3,7 @@ import xgboost as xgb import testing as tm import pytest +from test_dmatrix import set_base_margin_info try: import modin.pandas as md @@ -144,3 +145,6 @@ def test_modin_weight(self): assert data.num_col() == kCols np.testing.assert_array_equal(data.get_weight(), w) + + def test_base_margin(self): + set_base_margin_info(md.DataFrame, xgb.DMatrix, "hist") diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 0b25993a5ee5..a1feaacd484a 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -3,6 +3,7 @@ import xgboost as xgb import testing as tm import pytest +from test_dmatrix import set_base_margin_info try: import pandas as pd @@ -205,6 +206,9 @@ def test_pandas_weight(self): np.testing.assert_array_equal(data.get_weight(), w) + def test_base_margin(self): + set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist") + def test_cv_as_pandas(self): dm = xgb.DMatrix(dpath + 'agaricus.txt.train') params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 2a400871dc2b..13880a4355fb 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1,3 +1,4 @@ +from typing import Callable, Optional import collections import importlib.util import numpy as np @@ -1147,32 +1148,83 @@ def test_feature_weights(): assert poly_decreasing[0] < -0.08 -def run_boost_from_prediction(tree_method): - from sklearn.datasets import load_breast_cancer - X, y = load_breast_cancer(return_X_y=True) +def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]): + """ + Parameters + ---------- + + as_frame: A callable function to convert margin into DataFrame, useful for different + df implementations. + """ + model_0 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=4, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) model_0.fit(X=X, y=y) margin = model_0.predict(X, output_margin=True) + if as_frame is not None: + margin = as_frame(margin) model_1 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=4, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) model_1.fit(X=X, y=y, base_margin=margin) predictions_1 = model_1.predict(X, base_margin=margin) cls_2 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) cls_2.fit(X=X, y=y) predictions_2 = cls_2.predict(X) - assert np.all(predictions_1 == predictions_2) + np.testing.assert_allclose(predictions_1, predictions_2) + + +def run_boost_from_prediction_multi_clasas( + tree_method, X, y, as_frame: Optional[Callable] +): + # Multi-class + model_0 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_0.fit(X=X, y=y) + margin = model_0.get_booster().inplace_predict(X, predict_type="margin") + if as_frame is not None: + margin = as_frame(margin) + + model_1 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_1.fit(X=X, y=y, base_margin=margin) + predictions_1 = model_1.get_booster().predict( + xgb.DMatrix(X, base_margin=margin), output_margin=True + ) + + model_2 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) + model_2.fit(X=X, y=y) + predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin") + + if hasattr(predictions_1, "get"): + predictions_1 = predictions_1.get() + if hasattr(predictions_2, "get"): + predictions_2 = predictions_2.get() + np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6) @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"]) def test_boost_from_prediction(tree_method): - run_boost_from_prediction(tree_method) + from sklearn.datasets import load_breast_cancer, load_digits + import pandas as pd + X, y = load_breast_cancer(return_X_y=True) + + run_boost_from_prediction_binary(tree_method, X, y, None) + run_boost_from_prediction_binary(tree_method, X, y, pd.DataFrame) + + X, y = load_digits(return_X_y=True) + + run_boost_from_prediction_multi_clasas(tree_method, X, y, None) + run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame) def test_estimator_type(): diff --git a/tests/python/testing.py b/tests/python/testing.py index 328cc63a2e85..2d0886079f51 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -3,6 +3,7 @@ import urllib import zipfile import sys +from typing import Optional from contextlib import contextmanager from io import StringIO from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED @@ -177,7 +178,7 @@ def __init__(self, name, get_dataset, objective, metric): self.metric = metric self.X, self.y = get_dataset() self.w = None - self.margin = None + self.margin: Optional[np.ndarray] = None def set_params(self, params_in): params_in['objective'] = self.objective @@ -315,7 +316,7 @@ def make_categorical( @strategies.composite def _dataset_weight_margin(draw): - data = draw(_unweighted_datasets_strategy) + data: TestDataset = draw(_unweighted_datasets_strategy) if draw(strategies.booleans()): data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))) if draw(strategies.booleans()): @@ -324,6 +325,8 @@ def _dataset_weight_margin(draw): num_class = int(np.max(data.y) + 1) data.margin = draw( arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0))) + if num_class != 1: + data.margin = data.margin.reshape(data.y.shape[0], num_class) return data