From 6925b222e0dc268803b59c93b28db65cf130beb7 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 29 Sep 2022 00:57:52 +0800 Subject: [PATCH] Fix mixed types with cuDF. (#8280) --- python-package/xgboost/core.py | 59 ++++++++++++++++++++--------- python-package/xgboost/data.py | 60 +++++++++++++++++++----------- tests/python-gpu/test_from_cudf.py | 28 ++++++++++++-- 3 files changed, 103 insertions(+), 44 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index b97035424dc2..9fd0317c47d6 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1,49 +1,72 @@ # pylint: disable=too-many-arguments, too-many-branches, invalid-name # pylint: disable=too-many-lines, too-many-locals """Core XGBoost Library.""" -from abc import ABC, abstractmethod -from collections.abc import Mapping import copy -from typing import List, Optional, Any, Union, Dict, TypeVar -from typing import Callable, Tuple, cast, Sequence, Type, Iterable import ctypes +import json import os import re import sys -import json import warnings +from abc import ABC, abstractmethod +from collections.abc import Mapping from functools import wraps -from inspect import signature, Parameter +from inspect import Parameter, signature +from typing import ( + Any, + Callable, + Dict, + Iterable, + List, + Optional, + Sequence, + Tuple, + Type, + TypeVar, + Union, + cast, + overload, +) import numpy as np import scipy.sparse -from .compat import DataFrame, py_str, PANDAS_INSTALLED -from .libpath import find_lib_path from ._typing import ( - CStrPptr, - c_bst_ulong, + _T, + ArrayLike, + BoosterParam, + CFloatPtr, CNumeric, - DataType, CNumericPtr, + CStrPptr, CStrPtr, CTypeT, - ArrayLike, - CFloatPtr, - NumpyOrCupy, + CupyT, + DataType, FeatureInfo, - FeatureTypes, FeatureNames, - _T, - CupyT, - BoosterParam + FeatureTypes, + NumpyOrCupy, + c_bst_ulong, ) +from .compat import PANDAS_INSTALLED, DataFrame, py_str +from .libpath import find_lib_path class XGBoostError(ValueError): """Error thrown by xgboost trainer.""" +@overload +def from_pystr_to_cstr(data: str) -> bytes: + ... + + +@overload +def from_pystr_to_cstr(data: List[str]) -> ctypes.Array: + ... + + def from_pystr_to_cstr(data: Union[str, List[str]]) -> Union[bytes, ctypes.Array]: """Convert a Python str or list of Python str to C pointer diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index e11b5f067e01..bc1185809136 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -3,24 +3,33 @@ '''Data dispatching for DMatrix.''' import ctypes import json -import warnings import os -from typing import Any, Tuple, Callable, Optional, List, Union, Iterator, Sequence, cast +import warnings +from typing import Any, Callable, Iterator, List, Optional, Sequence, Tuple, Union, cast import numpy as np -from .core import c_array, _LIB, _check_call, c_str -from .core import _cuda_array_interface -from .core import DataIter, _ProxyDMatrix, DMatrix -from .compat import lazy_isinstance, DataFrame from ._typing import ( - c_bst_ulong, + CupyT, DataType, - FeatureTypes, FeatureNames, + FeatureTypes, + FloatCompatible, NumpyDType, - CupyT, - FloatCompatible, PandasDType + PandasDType, + c_bst_ulong, +) +from .compat import DataFrame, lazy_isinstance +from .core import ( + _LIB, + DataIter, + DMatrix, + _check_call, + _cuda_array_interface, + _ProxyDMatrix, + c_array, + c_str, + from_pystr_to_cstr, ) DispatchedDataBackendReturnType = Tuple[ @@ -631,10 +640,10 @@ def _is_cudf_df(data: DataType) -> bool: def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: - """Extract CuDF __cuda_array_interface__. This is special as it returns a new list of - data and a list of array interfaces. The data is list of categorical codes that - caller can safely ignore, but have to keep their reference alive until usage of array - interface is finished. + """Extract CuDF __cuda_array_interface__. This is special as it returns a new list + of data and a list of array interfaces. The data is list of categorical codes that + caller can safely ignore, but have to keep their reference alive until usage of + array interface is finished. """ try: @@ -643,14 +652,18 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: from cudf.utils.dtypes import is_categorical_dtype interfaces = [] + + def append(interface: dict) -> None: + if "mask" in interface: + interface["mask"] = interface["mask"].__cuda_array_interface__ + interfaces.append(interface) + if _is_cudf_ser(data): if is_categorical_dtype(data.dtype): interface = cat_codes[0].__cuda_array_interface__ else: interface = data.__cuda_array_interface__ - if "mask" in interface: - interface["mask"] = interface["mask"].__cuda_array_interface__ - interfaces.append(interface) + append(interface) else: for i, col in enumerate(data): if is_categorical_dtype(data[col].dtype): @@ -658,10 +671,8 @@ def _cudf_array_interfaces(data: DataType, cat_codes: list) -> bytes: interface = codes.__cuda_array_interface__ else: interface = data[col].__cuda_array_interface__ - if "mask" in interface: - interface["mask"] = interface["mask"].__cuda_array_interface__ - interfaces.append(interface) - interfaces_str = bytes(json.dumps(interfaces, indent=2), "utf-8") + append(interface) + interfaces_str = from_pystr_to_cstr(json.dumps(interfaces)) return interfaces_str @@ -722,9 +733,14 @@ def _transform_cudf_df( cat_codes.append(codes) else: for col in data: - if is_categorical_dtype(data[col].dtype) and enable_categorical: + dtype = data[col].dtype + if is_categorical_dtype(dtype) and enable_categorical: codes = data[col].cat.codes cat_codes.append(codes) + elif is_categorical_dtype(dtype): + raise ValueError(_ENABLE_CAT_ERR) + else: + cat_codes.append([]) return data, cat_codes, feature_names, feature_types diff --git a/tests/python-gpu/test_from_cudf.py b/tests/python-gpu/test_from_cudf.py index dc474f15e825..f924fc3483d9 100644 --- a/tests/python-gpu/test_from_cudf.py +++ b/tests/python-gpu/test_from_cudf.py @@ -1,6 +1,8 @@ +import json +import sys + import numpy as np import xgboost as xgb -import sys import pytest sys.path.append("tests/python") @@ -176,20 +178,38 @@ def test_cudf_metainfo_device_dmatrix(self): _test_cudf_metainfo(xgb.DeviceQuantileDMatrix) @pytest.mark.skipif(**tm.no_cudf()) - def test_cudf_categorical(self): + def test_cudf_categorical(self) -> None: import cudf - _X, _y = tm.make_categorical(100, 30, 17, False) + n_features = 30 + _X, _y = tm.make_categorical(100, n_features, 17, False) X = cudf.from_pandas(_X) y = cudf.from_pandas(_y) Xy = xgb.DMatrix(X, y, enable_categorical=True) + assert Xy.feature_types is not None assert len(Xy.feature_types) == X.shape[1] assert all(t == "c" for t in Xy.feature_types) Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True) + assert Xy.feature_types is not None assert len(Xy.feature_types) == X.shape[1] assert all(t == "c" for t in Xy.feature_types) + # mixed dtypes + X["1"] = X["1"].astype(np.int64) + X["3"] = X["3"].astype(np.int64) + df, cat_codes, _, _ = xgb.data._transform_cudf_df( + X, None, None, enable_categorical=True + ) + assert X.shape[1] == n_features + assert len(cat_codes) == X.shape[1] + assert not cat_codes[0] + assert not cat_codes[2] + + interfaces_str = xgb.data._cudf_array_interfaces(df, cat_codes) + interfaces = json.loads(interfaces_str) + assert len(interfaces) == X.shape[1] + # test missing value X = cudf.DataFrame({"f0": ["a", "b", np.NaN]}) X["f0"] = X["f0"].astype("category") @@ -206,7 +226,7 @@ def test_cudf_categorical(self): assert Xy.num_row() == 3 assert Xy.num_col() == 1 - with pytest.raises(ValueError): + with pytest.raises(ValueError, match="enable_categorical"): xgb.DeviceQuantileDMatrix(X, y) Xy = xgb.DeviceQuantileDMatrix(X, y, enable_categorical=True)