From 14666b96987e97e9ae420056893823ba40700f68 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 00:33:03 +0800 Subject: [PATCH 01/15] Support base margin for multi-class. * Numpy and pandas. --- python-package/xgboost/data.py | 14 ++++++--- tests/python/test_with_sklearn.py | 49 ++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 14 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 664911a5474b..3624ad0b0bb5 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -217,7 +217,7 @@ def _is_modin_df(data): } -def _invalid_dataframe_dtype(data) -> None: +def _invalid_dataframe_dtype(data: Any) -> None: # pandas series has `dtypes` but it's just a single object # cudf series doesn't have `dtypes`. if hasattr(data, "dtypes") and hasattr(data.dtypes, "__iter__"): @@ -291,7 +291,7 @@ def _transform_pandas_df( else: transformed = data - if meta and len(data.columns) > 1: + if meta and len(data.columns) > 1 and meta != "base_margin": raise ValueError(f"DataFrame for {meta} cannot have multiple columns") dtype = meta_type if meta_type else np.float32 @@ -820,8 +820,14 @@ def _to_data_type(dtype: str, name: str): return dtype_map[dtype] -def _validate_meta_shape(data, name: str) -> None: +def _validate_meta_shape(data: Any, name: str) -> None: + msg = f"Invalid shape: {data.shape} for {name}" if hasattr(data, "shape"): + if name == "base_margin": + if len(data.shape) > 2: + raise ValueError(msg) + return + if len(data.shape) > 2 or ( len(data.shape) == 2 and (data.shape[1] != 0 and data.shape[1] != 1) ): @@ -832,7 +838,7 @@ def _meta_from_numpy(data, field, dtype, handle): data = _maybe_np_slice(data, dtype) interface = data.__array_interface__ assert interface.get('mask', None) is None, 'Masked array is not supported' - size = data.shape[0] + size = data.size c_type = _to_data_type(str(data.dtype), field) ptr = interface['data'][0] diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 2a400871dc2b..80af18ac9811 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1147,32 +1147,61 @@ def test_feature_weights(): assert poly_decreasing[0] < -0.08 -def run_boost_from_prediction(tree_method): - from sklearn.datasets import load_breast_cancer +def run_boost_from_prediction(tree_method, as_frame): + from sklearn.datasets import load_breast_cancer, load_digits + import pandas as pd + + # binary-class X, y = load_breast_cancer(return_X_y=True) model_0 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=4, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) model_0.fit(X=X, y=y) margin = model_0.predict(X, output_margin=True) model_1 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=4, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) model_1.fit(X=X, y=y, base_margin=margin) predictions_1 = model_1.predict(X, base_margin=margin) cls_2 = xgb.XGBClassifier( - learning_rate=0.3, random_state=0, n_estimators=8, - tree_method=tree_method) + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) cls_2.fit(X=X, y=y) predictions_2 = cls_2.predict(X) - assert np.all(predictions_1 == predictions_2) + np.testing.assert_allclose(predictions_1, predictions_2) + + # Multi-class + X, y = load_digits(return_X_y=True) + model_0 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, 
n_estimators=4, tree_method=tree_method + ) + model_0.fit(X=X, y=y) + margin = model_0.get_booster().inplace_predict(X, predict_type="margin") + if as_frame: + margin = pd.DataFrame(margin) + + model_1 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_1.fit(X=X, y=y, base_margin=margin) + predictions_1 = model_1.get_booster().predict( + xgb.DMatrix(X, base_margin=margin), output_margin=True + ) + + cls_2 = xgb.XGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) + cls_2.fit(X=X, y=y) + predictions_2 = cls_2.get_booster().inplace_predict(X, predict_type="margin") + np.testing.assert_allclose(predictions_1, predictions_2) @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"]) def test_boost_from_prediction(tree_method): - run_boost_from_prediction(tree_method) + run_boost_from_prediction(tree_method, False) + run_boost_from_prediction(tree_method, True) def test_estimator_type(): From 2dcce5ebd3a0cd55b9b9e576611af5aa278c510b Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 00:49:20 +0800 Subject: [PATCH 02/15] Generalize. --- python-package/xgboost/core.py | 2 +- python-package/xgboost/data.py | 14 +++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 594ed74bb4eb..464e88656406 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -577,7 +577,7 @@ def __init__( # force into void_p, mac need to pass things in as void_p if data is None: - self.handle = None + self.handle: Optional[ctypes.c_void_p] = None return from .data import dispatch_data_backend, _is_iter diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 3624ad0b0bb5..4357fda24c83 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -18,6 +18,11 @@ CAT_T = "c" +# meta info that can be a matrix instead of vector. +# For now it's base_margin for multi-class, but it can be extended to label once we have +# multi-output. 
+_matrix_meta = {"base_margin"} + def _warn_unused_missing(data, missing): if (missing is not None) and (not np.isnan(missing)): @@ -291,7 +296,7 @@ def _transform_pandas_df( else: transformed = data - if meta and len(data.columns) > 1 and meta != "base_margin": + if meta and len(data.columns) > 1 and meta not in _matrix_meta: raise ValueError(f"DataFrame for {meta} cannot have multiple columns") dtype = meta_type if meta_type else np.float32 @@ -823,7 +828,7 @@ def _to_data_type(dtype: str, name: str): def _validate_meta_shape(data: Any, name: str) -> None: msg = f"Invalid shape: {data.shape} for {name}" if hasattr(data, "shape"): - if name == "base_margin": + if name in _matrix_meta: if len(data.shape) > 2: raise ValueError(msg) return @@ -834,7 +839,9 @@ def _validate_meta_shape(data: Any, name: str) -> None: raise ValueError(f"Invalid shape: {data.shape} for {name}") -def _meta_from_numpy(data, field, dtype, handle): +def _meta_from_numpy( + data: np.ndarray, field: str, dtype, handle: ctypes.c_void_p +) -> None: data = _maybe_np_slice(data, dtype) interface = data.__array_interface__ assert interface.get('mask', None) is None, 'Masked array is not supported' @@ -899,6 +906,7 @@ def _meta_from_dt(data, field, dtype, handle): def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): '''Dispatch for meta info.''' handle = matrix.handle + assert handle is not None _validate_meta_shape(data, name) if data is None: return From 5b95c1c78076398979abd0e260e40dadf85355e6 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 17:06:07 +0800 Subject: [PATCH 03/15] Add cuDF. --- python-package/xgboost/data.py | 32 +++++++++++++++++++------------- src/data/data.cu | 29 ++++++++++++++++++----------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 4357fda24c83..076c97fa75cc 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -328,6 +328,18 @@ def _is_pandas_series(data): return isinstance(data, pd.Series) +def _meta_from_pandas_series( + data, name: str, dtype: Optional[str], handle: ctypes.c_void_p +) -> None: + """Help transform pandas series for meta data like labels""" + data = data.values.astype('float') + from pandas.api.types import is_sparse + if is_sparse(data): + data = data.to_dense() + assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 + _meta_from_numpy(data, name, dtype, handle) + + def _is_modin_series(data): try: import modin.pandas as pd @@ -382,6 +394,7 @@ def _transform_dt_df( raise ValueError( 'DataTable for label or weight cannot have multiple columns') if meta: + meta_type = "float" if meta_type is None else meta_type # below requires new dt version # extract first column data = data.to_numpy()[:, 0].astype(meta_type) @@ -868,17 +881,12 @@ def _meta_from_tuple(data, field, dtype, handle): return _meta_from_list(data, field, dtype, handle) -def _meta_from_cudf_df(data, field, handle): - if len(data.columns) != 1: - raise ValueError( - 'Expecting meta-info to contain a single column') - data = data[data.columns[0]] +def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: + if field not in _matrix_meta: + data = data[data.columns[0]] - interface = bytes(json.dumps([data.__cuda_array_interface__], - indent=2), 'utf-8') - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, - c_str(field), - interface)) + interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8") + 
_check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) def _meta_from_cudf_series(data, field, handle): @@ -925,9 +933,7 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_series(data): - data = data.values.astype('float') - assert len(data.shape) == 1 or data.shape[1] == 0 or data.shape[1] == 1 - _meta_from_numpy(data, name, dtype, handle) + _meta_from_pandas_series(data, name, dtype, handle) return if _is_dlpack(data): data = _transform_dlpack(data) diff --git a/src/data/data.cu b/src/data/data.cu index 2f298f330743..6c90b97f6f1b 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -30,12 +30,16 @@ void CopyInfoImpl(ArrayInterface column, HostDeviceVector* out) { return; } out->SetDevice(ptr_device); - out->Resize(column.num_rows); - auto p_dst = thrust::device_pointer_cast(out->DevicePointer()); + size_t size = column.num_rows * column.num_cols; + CHECK_NE(size, 0); + out->Resize(size); - dh::LaunchN(column.num_rows, [=] __device__(size_t idx) { - p_dst[idx] = column.GetElement(idx, 0); + auto p_dst = thrust::device_pointer_cast(out->DevicePointer()); + dh::LaunchN(size, [=] __device__(size_t idx) { + size_t ridx = idx / column.num_cols; + size_t cidx = idx - (ridx * column.num_cols); + p_dst[idx] = column.GetElement(ridx, cidx); }); } @@ -131,11 +135,6 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { << "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1); ArrayInterface array_interface(interface_str); std::string key{c_key}; - if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || - (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { - // Not an empty column, transform it. - array_interface.AsColumnVector(); - } CHECK(!array_interface.valid.Data()) << "Meta info " << key << " should be dense, found validity mask"; @@ -143,6 +142,16 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { return; } + if (key == "base_margin") { + CopyInfoImpl(array_interface, &base_margin_); + return; + } + + if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || + (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { + // Not an empty column, transform it. + array_interface.AsColumnVector(); + } if (key == "label") { CopyInfoImpl(array_interface, &labels_); auto ptr = labels_.ConstDevicePointer(); @@ -155,8 +164,6 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { auto valid = thrust::none_of(thrust::device, ptr, ptr + weights_.Size(), WeightsCheck{}); CHECK(valid) << "Weights must be positive values."; - } else if (key == "base_margin") { - CopyInfoImpl(array_interface, &base_margin_); } else if (key == "group") { CopyGroupInfoImpl(array_interface, &group_ptr_); ValidateQueryGroup(group_ptr_); From 898a7f6fbf9f2fd37c12673fbe8672ed57f5ef04 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 17:15:36 +0800 Subject: [PATCH 04/15] Test. 
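The new tests boost a second classifier from the per-class margin of a
first one and check that the result matches a single longer run. A minimal
CPU sketch of the pattern being exercised (dataset and hyper-parameters are
illustrative, not part of the test):

    import numpy as np
    import xgboost as xgb
    from sklearn.datasets import load_digits

    X, y = load_digits(return_X_y=True)
    # The first model emits a margin with one column per class.
    clf = xgb.XGBClassifier(n_estimators=4, tree_method="hist").fit(X, y)
    margin = clf.get_booster().inplace_predict(X, predict_type="margin")
    assert margin.shape == (X.shape[0], len(np.unique(y)))
    # Continue boosting from that (n_samples, n_classes) margin.
    clf2 = xgb.XGBClassifier(n_estimators=4, tree_method="hist")
    clf2.fit(X, y, base_margin=margin)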
--- tests/python-gpu/test_gpu_with_sklearn.py | 19 +++++++++++- tests/python/test_with_sklearn.py | 38 +++++++++++++++++------ 2 files changed, 46 insertions(+), 11 deletions(-) diff --git a/tests/python-gpu/test_gpu_with_sklearn.py b/tests/python-gpu/test_gpu_with_sklearn.py index 5f70ef6310b1..17a2ccf832e9 100644 --- a/tests/python-gpu/test_gpu_with_sklearn.py +++ b/tests/python-gpu/test_gpu_with_sklearn.py @@ -35,8 +35,25 @@ def test_gpu_binary_classification(): assert err < 0.1 +@pytest.mark.skipif(**tm.no_cupy()) +@pytest.mark.skipif(**tm.no_cudf()) def test_boost_from_prediction_gpu_hist(): - twskl.run_boost_from_prediction('gpu_hist') + from sklearn.datasets import load_breast_cancer, load_digits + import cupy as cp + import cudf + + tree_method = "gpu_hist" + X, y = load_breast_cancer(return_X_y=True) + X, y = cp.array(X), cp.array(y) + + twskl.run_boost_from_prediction_binary(tree_method, X, y, None) + twskl.run_boost_from_prediction_binary(tree_method, X, y, cudf.DataFrame) + + X, y = load_digits(return_X_y=True) + X, y = cp.array(X), cp.array(y) + + twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, None) + twskl.run_boost_from_prediction_multi_clasas(tree_method, X, y, cudf.DataFrame) def test_num_parallel_tree(): diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 80af18ac9811..885b6ff3c9e6 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1,3 +1,4 @@ +from typing import Callable, Optional import collections import importlib.util import numpy as np @@ -1147,17 +1148,22 @@ def test_feature_weights(): assert poly_decreasing[0] < -0.08 -def run_boost_from_prediction(tree_method, as_frame): - from sklearn.datasets import load_breast_cancer, load_digits - import pandas as pd +def run_boost_from_prediction_binary(tree_method, X, y, as_frame: Optional[Callable]): + """ + Parameters + ---------- + + as_frame: A callable function to convert margin into DataFrame, useful for different + df implementations. 
+ """ - # binary-class - X, y = load_breast_cancer(return_X_y=True) model_0 = xgb.XGBClassifier( learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method ) model_0.fit(X=X, y=y) margin = model_0.predict(X, output_margin=True) + if as_frame is not None: + margin = as_frame(margin) model_1 = xgb.XGBClassifier( learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method @@ -1172,15 +1178,18 @@ def run_boost_from_prediction(tree_method, as_frame): predictions_2 = cls_2.predict(X) np.testing.assert_allclose(predictions_1, predictions_2) + +def run_boost_from_prediction_multi_clasas( + tree_method, X, y, as_frame: Optional[Callable] +): # Multi-class - X, y = load_digits(return_X_y=True) model_0 = xgb.XGBClassifier( learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method ) model_0.fit(X=X, y=y) margin = model_0.get_booster().inplace_predict(X, predict_type="margin") - if as_frame: - margin = pd.DataFrame(margin) + if as_frame is not None: + margin = as_frame(margin) model_1 = xgb.XGBClassifier( learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method @@ -1200,8 +1209,17 @@ def run_boost_from_prediction(tree_method, as_frame): @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"]) def test_boost_from_prediction(tree_method): - run_boost_from_prediction(tree_method, False) - run_boost_from_prediction(tree_method, True) + from sklearn.datasets import load_breast_cancer, load_digits + import pandas as pd + X, y = load_breast_cancer(return_X_y=True) + + run_boost_from_prediction_binary(tree_method, X, y, None) + run_boost_from_prediction_binary(tree_method, X, y, pd.DataFrame) + + X, y = load_digits(return_X_y=True) + + run_boost_from_prediction_multi_clasas(tree_method, X, y, None) + run_boost_from_prediction_multi_clasas(tree_method, X, y, pd.DataFrame) def test_estimator_type(): From 92025abc637bfb8bb875d9ea7f888e3aeaa8bf53 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 17:27:47 +0800 Subject: [PATCH 05/15] Add tests. 
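The dask variant runs the same round trip through the xgb.dask wrappers. A
rough sketch of the flow, using synthetic dask arrays and a local cluster
instead of the datasets loaded by the test (the cluster setup and data are
illustrative):

    import dask.array as da
    import xgboost as xgb
    from distributed import Client, LocalCluster

    with Client(LocalCluster(n_workers=2)) as client:
        X = da.random.random((1000, 10), chunks=(100, 10))
        y = (da.random.random(1000, chunks=100) * 3).astype("int64")
        clf = xgb.dask.DaskXGBClassifier(n_estimators=4, tree_method="hist")
        clf.fit(X, y)
        # The margin is a lazy (n_samples, n_classes) collection and can be
        # fed straight back as base_margin.
        margin = xgb.dask.inplace_predict(
            client, clf.get_booster(), X, predict_type="margin"
        )
        clf2 = xgb.dask.DaskXGBClassifier(n_estimators=4, tree_method="hist")
        clf2.fit(X, y, base_margin=margin)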
--- tests/python-gpu/test_gpu_with_dask.py | 13 ++++++- tests/python/test_with_dask.py | 50 ++++++++++++++++++++++++-- tests/python/test_with_sklearn.py | 6 ++-- 3 files changed, 63 insertions(+), 6 deletions(-) diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index d9b9f8684863..ca5f3c971f7e 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -22,6 +22,7 @@ from test_with_dask import run_empty_dmatrix_auc # noqa from test_with_dask import run_auc # noqa from test_with_dask import run_boost_from_prediction # noqa +from test_with_dask import run_boost_from_prediction_multi_clasas # noqa from test_with_dask import run_dask_classifier # noqa from test_with_dask import run_empty_dmatrix_cls # noqa from test_with_dask import _get_client_workers # noqa @@ -297,13 +298,23 @@ def run_gpu_hist( @pytest.mark.skipif(**tm.no_cudf()) def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None: import cudf - from sklearn.datasets import load_breast_cancer + import dask_cudf + from sklearn.datasets import load_breast_cancer, load_digits with Client(local_cuda_cluster) as client: X_, y_ = load_breast_cancer(return_X_y=True) X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) run_boost_from_prediction(X, y, "gpu_hist", client) + X_, y_ = load_digits(return_X_y=True) + X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) + y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) + + run_boost_from_prediction_multi_clasas( + X, y, "gpu_hist", client, lambda margin: dask_cudf.from_dask_array(margin) + ) + run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client, None) + class TestDistributedGPU: @pytest.mark.skipif(**tm.no_dask()) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index f506d0d80e92..8ac1a9ba2eab 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -7,7 +7,7 @@ import numpy as np import scipy import json -from typing import List, Tuple, Dict, Optional, Type, Any +from typing import List, Tuple, Dict, Optional, Type, Any, Callable import asyncio from functools import partial from concurrent.futures import ThreadPoolExecutor @@ -182,6 +182,45 @@ def test_dask_predict_shape_infer(client: "Client") -> None: assert prediction.shape[1] == 3 +def run_boost_from_prediction_multi_clasas( + X: xgb.dask._DaskCollection, + y: xgb.dask._DaskCollection, + tree_method: str, + as_frame: Optional[Callable], + client: "Client" +) -> None: + model_0 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_0.fit(X=X, y=y) + margin = xgb.dask.inplace_predict( + client, model_0.get_booster(), X, predict_type="margin" + ) + + if as_frame is not None: + margin = as_frame(margin) + + model_1 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method + ) + model_1.fit(X=X, y=y, base_margin=margin) + predictions_1 = xgb.dask.predict( + client, + model_1.get_booster(), + xgb.dask.DaskDMatrix(X, base_margin=margin), + output_margin=True + ) + + model_2 = xgb.dask.DaskXGBClassifier( + learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method + ) + model_2.fit(X=X, y=y) + predictions_2 = xgb.dask.inplace_predict( + client, model_2.get_booster(), X, predict_type="margin" + ) + 
np.testing.assert_allclose(predictions_1.compute(), predictions_2.compute()) + + def run_boost_from_prediction( X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, client: "Client" ) -> None: @@ -227,11 +266,18 @@ def run_boost_from_prediction( @pytest.mark.parametrize("tree_method", ["hist", "approx"]) def test_boost_from_prediction(tree_method: str, client: "Client") -> None: - from sklearn.datasets import load_breast_cancer + from sklearn.datasets import load_breast_cancer, load_digits X_, y_ = load_breast_cancer(return_X_y=True) X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) run_boost_from_prediction(X, y, tree_method, client) + X_, y_ = load_digits(return_X_y=True) + X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) + run_boost_from_prediction_multi_clasas( + X, y, tree_method, client, lambda margin: dd.from_dask_array(margin) + ) + run_boost_from_prediction_multi_clasas(X, y, tree_method, client, None) + def test_inplace_predict(client: "Client") -> None: from sklearn.datasets import load_boston diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 885b6ff3c9e6..ff2f2b8ac51d 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1199,11 +1199,11 @@ def run_boost_from_prediction_multi_clasas( xgb.DMatrix(X, base_margin=margin), output_margin=True ) - cls_2 = xgb.XGBClassifier( + model_2 = xgb.XGBClassifier( learning_rate=0.3, random_state=0, n_estimators=8, tree_method=tree_method ) - cls_2.fit(X=X, y=y) - predictions_2 = cls_2.get_booster().inplace_predict(X, predict_type="margin") + model_2.fit(X=X, y=y) + predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin") np.testing.assert_allclose(predictions_1, predictions_2) From 4043c62835e5e2043f4c88bc26dc541fbefadcd2 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 17:33:55 +0800 Subject: [PATCH 06/15] Fixes. 
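Among other things this fixes the DaskDMatrix construction in the
multi-class runner: the client handle is the first positional argument.
The corrected call pattern, with the names used in the test:

    m = xgb.dask.DaskDMatrix(client, X, base_margin=margin)
    predictions_1 = xgb.dask.predict(
        client, model_1.get_booster(), m, output_margin=True
    )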
--- tests/python-gpu/test_gpu_with_dask.py | 7 +------ tests/python/test_with_dask.py | 11 ++--------- 2 files changed, 3 insertions(+), 15 deletions(-) diff --git a/tests/python-gpu/test_gpu_with_dask.py b/tests/python-gpu/test_gpu_with_dask.py index ca5f3c971f7e..7150566c89a5 100644 --- a/tests/python-gpu/test_gpu_with_dask.py +++ b/tests/python-gpu/test_gpu_with_dask.py @@ -298,7 +298,6 @@ def run_gpu_hist( @pytest.mark.skipif(**tm.no_cudf()) def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None: import cudf - import dask_cudf from sklearn.datasets import load_breast_cancer, load_digits with Client(local_cuda_cluster) as client: X_, y_ = load_breast_cancer(return_X_y=True) @@ -309,11 +308,7 @@ def test_boost_from_prediction(local_cuda_cluster: LocalCUDACluster) -> None: X_, y_ = load_digits(return_X_y=True) X = dd.from_array(X_, chunksize=100).map_partitions(cudf.from_pandas) y = dd.from_array(y_, chunksize=100).map_partitions(cudf.from_pandas) - - run_boost_from_prediction_multi_clasas( - X, y, "gpu_hist", client, lambda margin: dask_cudf.from_dask_array(margin) - ) - run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client, None) + run_boost_from_prediction_multi_clasas(X, y, "gpu_hist", client) class TestDistributedGPU: diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 8ac1a9ba2eab..f9a1f47a7042 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -186,7 +186,6 @@ def run_boost_from_prediction_multi_clasas( X: xgb.dask._DaskCollection, y: xgb.dask._DaskCollection, tree_method: str, - as_frame: Optional[Callable], client: "Client" ) -> None: model_0 = xgb.dask.DaskXGBClassifier( @@ -197,9 +196,6 @@ def run_boost_from_prediction_multi_clasas( client, model_0.get_booster(), X, predict_type="margin" ) - if as_frame is not None: - margin = as_frame(margin) - model_1 = xgb.dask.DaskXGBClassifier( learning_rate=0.3, random_state=0, n_estimators=4, tree_method=tree_method ) @@ -207,7 +203,7 @@ def run_boost_from_prediction_multi_clasas( predictions_1 = xgb.dask.predict( client, model_1.get_booster(), - xgb.dask.DaskDMatrix(X, base_margin=margin), + xgb.dask.DaskDMatrix(client, X, base_margin=margin), output_margin=True ) @@ -273,10 +269,7 @@ def test_boost_from_prediction(tree_method: str, client: "Client") -> None: X_, y_ = load_digits(return_X_y=True) X, y = dd.from_array(X_, chunksize=100), dd.from_array(y_, chunksize=100) - run_boost_from_prediction_multi_clasas( - X, y, tree_method, client, lambda margin: dd.from_dask_array(margin) - ) - run_boost_from_prediction_multi_clasas(X, y, tree_method, client, None) + run_boost_from_prediction_multi_clasas(X, y, tree_method, client) def test_inplace_predict(client: "Client") -> None: From 29294718dcf3738264677e9c6ec3f000782fa441 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 18:09:53 +0800 Subject: [PATCH 07/15] Fixes. 
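With this change a multi-column cuDF DataFrame can carry the per-class
margin: the whole frame is forwarded through the __cuda_array_interface__
of its values instead of being reduced to its first column. A GPU-only
sketch of the resulting user-facing behaviour (shapes and parameters are
illustrative, and a CUDA device is required):

    import cupy as cp
    import cudf
    import xgboost as xgb

    X = cp.random.randn(100, 4)
    y = (cp.random.rand(100) * 3).astype(cp.int32)      # three classes
    margin = cudf.DataFrame(cp.random.randn(100, 3))    # one column per class
    Xy = xgb.DMatrix(X, y, base_margin=margin)
    booster = xgb.train(
        {"objective": "multi:softprob", "num_class": 3, "tree_method": "gpu_hist"},
        Xy,
        num_boost_round=2,
    )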
--- python-package/xgboost/data.py | 11 ++++++----- src/data/data.cu | 5 ++--- tests/python/test_with_sklearn.py | 7 ++++++- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 076c97fa75cc..5886bb66302c 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -839,8 +839,8 @@ def _to_data_type(dtype: str, name: str): def _validate_meta_shape(data: Any, name: str) -> None: - msg = f"Invalid shape: {data.shape} for {name}" if hasattr(data, "shape"): + msg = f"Invalid shape: {data.shape} for {name}" if name in _matrix_meta: if len(data.shape) > 2: raise ValueError(msg) @@ -883,10 +883,11 @@ def _meta_from_tuple(data, field, dtype, handle): def _meta_from_cudf_df(data, field: str, handle: ctypes.c_void_p) -> None: if field not in _matrix_meta: - data = data[data.columns[0]] - - interface = bytes(json.dumps([data.__cuda_array_interface__], indent=2), "utf-8") - _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) + _meta_from_cudf_series(data.iloc[:, 0], field, handle) + else: + data = data.values + interface = _cuda_array_interface(data) + _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface)) def _meta_from_cudf_series(data, field, handle): diff --git a/src/data/data.cu b/src/data/data.cu index 6c90b97f6f1b..c4c7c503cb2e 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -130,9 +130,6 @@ void ValidateQueryGroup(std::vector const &group_ptr_); void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); - auto const& j_arr = get(j_interface); - CHECK_EQ(j_arr.size(), 1) - << "MetaInfo: " << c_key << ". " << ArrayInterfaceErrors::Dimension(1); ArrayInterface array_interface(interface_str); std::string key{c_key}; @@ -147,6 +144,8 @@ void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { return; } + CHECK(array_interface.num_cols == 1 || array_interface.num_rows == 1) + << "MetaInfo: " << c_key << " has invalid shape"; if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { // Not an empty column, transform it. diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index ff2f2b8ac51d..13880a4355fb 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -1204,7 +1204,12 @@ def run_boost_from_prediction_multi_clasas( ) model_2.fit(X=X, y=y) predictions_2 = model_2.get_booster().inplace_predict(X, predict_type="margin") - np.testing.assert_allclose(predictions_1, predictions_2) + + if hasattr(predictions_1, "get"): + predictions_1 = predictions_1.get() + if hasattr(predictions_2, "get"): + predictions_2 = predictions_2.get() + np.testing.assert_allclose(predictions_1, predictions_2, atol=1e-6) @pytest.mark.parametrize("tree_method", ["hist", "approx", "exact"]) From 696024e6e6ef92a8fc7defea42915d43e3757e75 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 18:30:22 +0800 Subject: [PATCH 08/15] Test and error message. 
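The predictors now report the expected (n_samples, n_outputs) shape when
the stored base margin has the wrong size. A CPU sketch of the failure mode
covered by the new tests (shapes are illustrative):

    import numpy as np
    import pytest
    import xgboost as xgb

    rng = np.random.default_rng(1994)
    X = rng.normal(size=(50, 2))
    y = X[:, 0]
    # Setting a (50, 2) margin on a single-output model is accepted here...
    Xy = xgb.DMatrix(X, y, base_margin=X)
    # ...but training fails with the new shape message from the predictor.
    with pytest.raises(ValueError, match="base_margin"):
        xgb.train({"tree_method": "hist"}, Xy, num_boost_round=1)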
--- src/predictor/cpu_predictor.cc | 27 ++++++++------------------- src/predictor/gpu_predictor.cu | 6 +++++- tests/python-gpu/test_from_cudf.py | 5 +++++ tests/python-gpu/test_from_cupy.py | 5 +++++ 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index b113a31761f5..d581f64a1d56 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -290,27 +290,16 @@ class CPUPredictor : public Predictor { const auto& base_margin = info.base_margin_.HostVector(); out_preds->Resize(n); std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.size() == n) { - CHECK_EQ(out_preds->Size(), n); - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } else { - if (!base_margin.empty()) { - std::ostringstream oss; - oss << "Ignoring the base margin, since it has incorrect length. " - << "The base margin must be an array of length "; - if (model.learner_model_param->num_output_group > 1) { - oss << "[num_class] * [number of data points], i.e. " - << model.learner_model_param->num_output_group << " * " << info.num_row_ - << " = " << n << ". "; - } else { - oss << "[number of data points], i.e. " << info.num_row_ << ". "; - } - oss << "Instead, all data points will use " - << "base_score = " << model.learner_model_param->base_score; - LOG(WARNING) << oss.str(); - } + if (base_margin.empty()) { std::fill(out_preds_h.begin(), out_preds_h.end(), model.learner_model_param->base_score); + } else { + std::string expected{ + "(" + std::to_string(info.num_row_) + ", " + + std::to_string(model.learner_model_param->num_output_group) + ")"}; + CHECK_EQ(base_margin.size(), n) + << "Invalid shape of base_margin. Expected:" << expected; + std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); } } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index d1b9c8132659..51674237e973 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -938,7 +938,11 @@ class GPUPredictor : public xgboost::Predictor { out_preds->SetDevice(generic_param_->gpu_id); out_preds->Resize(n); if (base_margin.Size() != 0) { - CHECK_EQ(base_margin.Size(), n); + std::string expected{ + "(" + std::to_string(info.num_row_) + ", " + + std::to_string(model.learner_model_param->num_output_group) + ")"}; + CHECK_EQ(base_margin.Size(), n) + << "Invalid shape of base_margin. 
Expected:" << expected; out_preds->Copy(base_margin); } else { out_preds->Fill(model.learner_model_param->base_score); diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index 6250ab328208..f2061f303d04 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -142,6 +142,11 @@ def _test_cudf_metainfo(DMatrixT): dmat_cudf.get_float_info('base_margin')) assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr')) + # Invalid shape for base margin + Xy = xgb.DMatrix(X, floats, base_margin=X) + with pytest.raises(ValueError): + xgb.train({}, Xy) + class TestFromColumnar: '''Tests for constructing DMatrix from data structure conforming Apache diff --git a/tests/python-gpu/test_from_cupy.py b/tests/python-gpu/test_from_cupy.py index fee5edbb8c44..18b8f77a5a96 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -107,6 +107,11 @@ def _test_cupy_metainfo(DMatrixT): assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr')) + # Invalid shape for base margin + Xy = xgb.DMatrix(X, floats, base_margin=X) + with pytest.raises(ValueError): + xgb.train({}, Xy) + @pytest.mark.skipif(**tm.no_cupy()) @pytest.mark.skipif(**tm.no_sklearn()) From 3d1169abf9c7fcb0989b25915bbfbb6848d63dcf Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 18:58:18 +0800 Subject: [PATCH 09/15] Thorough. --- python-package/xgboost/data.py | 7 +++---- tests/python-gpu/test_from_cudf.py | 6 ++---- tests/python-gpu/test_from_cupy.py | 6 ++---- tests/python/test_dmatrix.py | 17 ++++++++++++++++- tests/python/test_with_modin.py | 4 ++++ tests/python/test_with_pandas.py | 4 ++++ 6 files changed, 31 insertions(+), 13 deletions(-) diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 5886bb66302c..285b2e840f59 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -391,8 +391,7 @@ def _transform_dt_df( ): """Validate feature names and types if data table""" if meta and data.shape[1] > 1: - raise ValueError( - 'DataTable for label or weight cannot have multiple columns') + raise ValueError('DataTable for meta info cannot have multiple columns') if meta: meta_type = "float" if meta_type is None else meta_type # below requires new dt version @@ -907,8 +906,8 @@ def _meta_from_cupy_array(data, field, handle): interface)) -def _meta_from_dt(data, field, dtype, handle): - data, _, _ = _transform_dt_df(data, None, None) +def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): + data, _, _ = _transform_dt_df(data, None, None, field, dtype) _meta_from_numpy(data, field, dtype, handle) diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index f2061f303d04..defc2b2197be 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -5,6 +5,7 @@ sys.path.append("tests/python") import testing as tm +from test_dmatrix import set_base_margin_info def dmatrix_from_cudf(input_type, DMatrixT, missing=np.NAN): @@ -142,10 +143,7 @@ def _test_cudf_metainfo(DMatrixT): dmat_cudf.get_float_info('base_margin')) assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cudf.get_uint_info('group_ptr')) - # Invalid shape for base margin - Xy = xgb.DMatrix(X, floats, base_margin=X) - with pytest.raises(ValueError): - xgb.train({}, Xy) + set_base_margin_info(df, DMatrixT, "gpu_hist") class TestFromColumnar: diff --git a/tests/python-gpu/test_from_cupy.py 
b/tests/python-gpu/test_from_cupy.py index 18b8f77a5a96..cbed40777fc6 100644 --- a/tests/python-gpu/test_from_cupy.py +++ b/tests/python-gpu/test_from_cupy.py @@ -5,6 +5,7 @@ sys.path.append("tests/python") import testing as tm +from test_dmatrix import set_base_margin_info def dmatrix_from_cupy(input_type, DMatrixT, missing=np.NAN): @@ -107,10 +108,7 @@ def _test_cupy_metainfo(DMatrixT): assert np.array_equal(dmat.get_uint_info('group_ptr'), dmat_cupy.get_uint_info('group_ptr')) - # Invalid shape for base margin - Xy = xgb.DMatrix(X, floats, base_margin=X) - with pytest.raises(ValueError): - xgb.train({}, Xy) + set_base_margin_info(cp.asarray, DMatrixT, "gpu_hist") @pytest.mark.skipif(**tm.no_cupy()) diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 313cf822fb98..f5e2c7fcfc67 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -15,6 +15,18 @@ rng = np.random.RandomState(1994) +def set_base_margin_info(DType, DMatrixT, tm: str): + rng = np.random.default_rng() + X = rng.normal(0, 1.0, size=100).reshape(10, 10) + y = X[:, 0] + base_margin = DType(X) + # no error at set + Xy = DMatrixT(X, y, base_margin=base_margin) + # Error at train, caused by check in predictor. + with pytest.raises(ValueError, match=r".*base_margin.*"): + xgb.train({"tree_method": tm}, Xy) + + class TestDMatrix: def test_warn_missing(self): from xgboost import data @@ -122,7 +134,7 @@ def test_slice(self): # base margin is per-class in multi-class classifier base_margin = rng.randn(100, 3).astype(np.float32) - d.set_base_margin(base_margin.flatten()) + d.set_base_margin(base_margin) ridxs = [1, 2, 3, 4, 5, 6] sliced = d.slice(ridxs) @@ -380,3 +392,6 @@ def test_uri_categorical(self): feature_types = ["q"] * 5 + ["c"] + ["q"] * 120 Xy = xgb.DMatrix(path + "?indexing_mode=1", feature_types=feature_types) np.testing.assert_equal(np.array(Xy.feature_types), np.array(feature_types)) + + def test_base_margin(self): + set_base_margin_info(np.asarray, xgb.DMatrix, "hist") diff --git a/tests/python/test_with_modin.py b/tests/python/test_with_modin.py index 9a5ca32e5bb7..e997202d12d8 100644 --- a/tests/python/test_with_modin.py +++ b/tests/python/test_with_modin.py @@ -3,6 +3,7 @@ import xgboost as xgb import testing as tm import pytest +from test_dmatrix import set_base_margin_info try: import modin.pandas as md @@ -144,3 +145,6 @@ def test_modin_weight(self): assert data.num_col() == kCols np.testing.assert_array_equal(data.get_weight(), w) + + def test_base_margin(self): + set_base_margin_info(md.DataFrame, xgb.DMatrix, "hist") diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 0b25993a5ee5..a1feaacd484a 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -3,6 +3,7 @@ import xgboost as xgb import testing as tm import pytest +from test_dmatrix import set_base_margin_info try: import pandas as pd @@ -205,6 +206,9 @@ def test_pandas_weight(self): np.testing.assert_array_equal(data.get_weight(), w) + def test_base_margin(self): + set_base_margin_info(pd.DataFrame, xgb.DMatrix, "hist") + def test_cv_as_pandas(self): dm = xgb.DMatrix(dpath + 'agaricus.txt.train') params = {'max_depth': 2, 'eta': 1, 'verbosity': 0, From 3888faf45eb30fccb2f58b19136650ef07137a41 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 19:03:26 +0800 Subject: [PATCH 10/15] Fix. 
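The dask comparison has to cope with whatever container the backend
returns: dask-cuDF yields a cuDF frame (.values), cupy arrays need an
explicit copy to host (.get()). The inline normalisation below is
equivalent to a small hypothetical helper like this, where a and b are the
computed predictions from the test:

    def to_host(result):
        if hasattr(result, "values"):  # cuDF DataFrame / Series -> cupy
            result = result.values
        if hasattr(result, "get"):     # cupy -> numpy
            result = result.get()
        return result

    np.testing.assert_allclose(to_host(a), to_host(b), atol=1e-5)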
--- tests/python/test_with_dask.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index f9a1f47a7042..3315d3d8f184 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -214,7 +214,16 @@ def run_boost_from_prediction_multi_clasas( predictions_2 = xgb.dask.inplace_predict( client, model_2.get_booster(), X, predict_type="margin" ) - np.testing.assert_allclose(predictions_1.compute(), predictions_2.compute()) + a = predictions_1.compute() + b = predictions_2.compute() + # cupy/cudf + if hasattr(a, "get"): + a = a.get() + if hasattr(b, "values"): + b = b.values + if hasattr(b, "get"): + b = b.get() + np.testing.assert_allclose(a, b, atol=1e-5) def run_boost_from_prediction( From 7232bc3fdbfe3beb90ff0fbd8b27f04d09a17ab7 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 19:13:04 +0800 Subject: [PATCH 11/15] Remove old document. --- python-package/xgboost/dask.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python-package/xgboost/dask.py b/python-package/xgboost/dask.py index cb103b194bb5..9560ff2f1992 100644 --- a/python-package/xgboost/dask.py +++ b/python-package/xgboost/dask.py @@ -1432,9 +1432,7 @@ def inplace_predict( # pylint: disable=unused-argument Value in the input data which needs to be present as a missing value. If None, defaults to np.nan. base_margin: - See :py:obj:`xgboost.DMatrix` for details. Right now classifier is not well - supported with base_margin as it requires the size of base margin to be `n_classes - * n_samples`. + See :py:obj:`xgboost.DMatrix` for details. .. versionadded:: 1.4.0 From 2db843c697cfb706ff54037d6b0b0dd6dafc085e Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 19:52:37 +0800 Subject: [PATCH 12/15] Fix test. --- tests/python/test_dmatrix.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index f5e2c7fcfc67..52086378ad5b 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -17,9 +17,12 @@ def set_base_margin_info(DType, DMatrixT, tm: str): rng = np.random.default_rng() - X = rng.normal(0, 1.0, size=100).reshape(10, 10) - y = X[:, 0] - base_margin = DType(X) + X = DType(rng.normal(0, 1.0, size=100).reshape(50, 2)) + if hasattr(X, "iloc"): + y = X.iloc[:, 0] + else: + y = X[:, 0] + base_margin = X # no error at set Xy = DMatrixT(X, y, base_margin=base_margin) # Error at train, caused by check in predictor. From 23be786ba3fbd780b3699bafed02be0689964f57 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 29 Oct 2021 20:17:21 +0800 Subject: [PATCH 13/15] Try to avoid using array. 
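ArrayInterface now stores explicit stride_row / stride_col members instead
of a two-element array. Both are the array-interface strides divided by the
item size, so element (r, c) is read at the flat offset
stride_row * r + stride_col * c. A numpy view illustrates the numbers
involved (the values shown apply to this particular view only):

    import numpy as np

    a = np.arange(12, dtype=np.float32).reshape(3, 4)
    view = a[:, ::2]                                     # non-contiguous (3, 2) view
    itemsize = view.dtype.itemsize                       # 4 bytes
    byte_strides = view.__array_interface__["strides"]   # (16, 8)
    stride_row, stride_col = (s // itemsize for s in byte_strides)
    # stride_row == 4, stride_col == 2: element (r, c) of the view sits at
    # flat index stride_row * r + stride_col * c in the underlying buffer.
    assert a.ravel()[stride_row * 2 + stride_col * 1] == view[2, 1]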
--- src/data/array_interface.h | 28 +++++++++++++++------------- tests/python/test_dmatrix.py | 3 +++ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/src/data/array_interface.h b/src/data/array_interface.h index b7ca311439ae..6524f4512407 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -210,27 +210,28 @@ class ArrayInterfaceHandler { } static void ExtractStride(std::map const &column, - size_t strides[2], size_t rows, size_t cols, size_t itemsize) { + size_t *stride_r, size_t *stride_c, size_t rows, + size_t cols, size_t itemsize) { auto strides_it = column.find("strides"); if (strides_it == column.cend() || IsA(strides_it->second)) { // default strides - strides[0] = cols; - strides[1] = 1; + *stride_r = cols; + *stride_c = 1; } else { // strides specified by the array interface auto const &j_strides = get(strides_it->second); CHECK_LE(j_strides.size(), 2) << ArrayInterfaceErrors::Dimension(2); - strides[0] = get(j_strides[0]) / itemsize; + *stride_r = get(j_strides[0]) / itemsize; size_t n = 1; if (j_strides.size() == 2) { n = get(j_strides[1]) / itemsize; } - strides[1] = n; + *stride_c = n; } - auto valid = rows * strides[0] + cols * strides[1] >= (rows * cols); + auto valid = rows * (*stride_r) + cols * (*stride_c) >= (rows * cols); CHECK(valid) << "Invalid strides in array." - << " strides: (" << strides[0] << "," << strides[1] + << " strides: (" << (*stride_r) << "," << (*stride_c) << "), shape: (" << rows << ", " << cols << ")"; } @@ -281,8 +282,8 @@ class ArrayInterface { << "Masked array is not yet supported."; } - ArrayInterfaceHandler::ExtractStride(array, strides, num_rows, num_cols, - typestr[2] - '0'); + ArrayInterfaceHandler::ExtractStride(array, &stride_row, &stride_col, + num_rows, num_cols, typestr[2] - '0'); auto stream_it = array.find("stream"); if (stream_it != array.cend() && !IsA(stream_it->second)) { @@ -323,8 +324,8 @@ class ArrayInterface { num_rows = std::max(num_rows, static_cast(num_cols)); num_cols = 1; - strides[0] = std::max(strides[0], strides[1]); - strides[1] = 1; + stride_row = std::max(stride_row, stride_col); + stride_col = 1; } void AssignType(StringView typestr) { @@ -406,13 +407,14 @@ class ArrayInterface { template XGBOOST_DEVICE T GetElement(size_t r, size_t c) const { return this->DispatchCall( - [=](auto *p_values) -> T { return p_values[strides[0] * r + strides[1] * c]; }); + [=](auto *p_values) -> T { return p_values[stride_row * r + stride_col * c]; }); } RBitField8 valid; bst_row_t num_rows; bst_feature_t num_cols; - size_t strides[2]{0, 0}; + size_t stride_row{0}; + size_t stride_col{0}; void* data; Type type; }; diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 52086378ad5b..1b5ad266d3d9 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -29,6 +29,9 @@ def set_base_margin_info(DType, DMatrixT, tm: str): with pytest.raises(ValueError, match=r".*base_margin.*"): xgb.train({"tree_method": tm}, Xy) + # FIXME(jiamingy): Currently the metainfo has no concept of shape. If you pass a + # base_margin with shape (n_classes, n_samples) to XGBoost the result is undefined. + class TestDMatrix: def test_warn_missing(self): From 342e2d080d329de25594e6400dca8eea585924d8 Mon Sep 17 00:00:00 2001 From: fis Date: Mon, 1 Nov 2021 19:50:46 +0800 Subject: [PATCH 14/15] Set device. 
--- tests/cpp/data/test_metainfo.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index a58bda90d9df..bb5452a56d28 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -252,6 +252,8 @@ TEST(MetaInfo, Validate) { EXPECT_THROW(info.Validate(1), dmlc::Error); xgboost::HostDeviceVector d_groups{groups}; + d_groups.SetDevice(0); + d_groups.DevicePointer(); // pull to device auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1); std::string arr_interface_str; xgboost::Json::Dump(arr_interface, &arr_interface_str); From f7a05896fa8859f734f9f7f090a2c5851f7fb1bd Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 2 Nov 2021 02:46:28 +0800 Subject: [PATCH 15/15] Better testing. --- tests/python/testing.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/python/testing.py b/tests/python/testing.py index 328cc63a2e85..2d0886079f51 100644 --- a/tests/python/testing.py +++ b/tests/python/testing.py @@ -3,6 +3,7 @@ import urllib import zipfile import sys +from typing import Optional from contextlib import contextmanager from io import StringIO from xgboost.compat import SKLEARN_INSTALLED, PANDAS_INSTALLED @@ -177,7 +178,7 @@ def __init__(self, name, get_dataset, objective, metric): self.metric = metric self.X, self.y = get_dataset() self.w = None - self.margin = None + self.margin: Optional[np.ndarray] = None def set_params(self, params_in): params_in['objective'] = self.objective @@ -315,7 +316,7 @@ def make_categorical( @strategies.composite def _dataset_weight_margin(draw): - data = draw(_unweighted_datasets_strategy) + data: TestDataset = draw(_unweighted_datasets_strategy) if draw(strategies.booleans()): data.w = draw(arrays(np.float64, (len(data.y)), elements=strategies.floats(0.1, 2.0))) if draw(strategies.booleans()): @@ -324,6 +325,8 @@ def _dataset_weight_margin(draw): num_class = int(np.max(data.y) + 1) data.margin = draw( arrays(np.float64, (len(data.y) * num_class), elements=strategies.floats(0.5, 1.0))) + if num_class != 1: + data.margin = data.margin.reshape(data.y.shape[0], num_class) return data
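Taken together, the series lets a per-class base margin be supplied as a
(n_samples, n_classes) matrix through numpy, pandas, cuDF/cupy and dask
inputs, with shape mismatches reported by the predictors. A minimal
end-to-end sketch with numpy (parameters illustrative):

    import numpy as np
    import xgboost as xgb

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 5))
    y = rng.integers(0, 3, size=200)
    margin = rng.normal(size=(200, 3))          # one column per class
    Xy = xgb.DMatrix(X, y, base_margin=margin)
    booster = xgb.train(
        {"objective": "multi:softprob", "num_class": 3}, Xy, num_boost_round=4
    )
    # Margins must be supplied again at prediction time to stay consistent.
    preds = booster.predict(xgb.DMatrix(X, base_margin=margin))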