From 7622b8cdb8005e1d221c6c15b7cd5c7479560c55 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 29 Sep 2020 11:22:56 +0800 Subject: [PATCH] Enable categorical data support on Python DMatrix. (#6166) * Only pandas is recognized. --- python-package/xgboost/core.py | 16 ++++++++++++-- python-package/xgboost/data.py | 36 ++++++++++++++++++++------------ tests/python/test_with_modin.py | 9 ++++---- tests/python/test_with_pandas.py | 19 +++++++++++++---- 4 files changed, 57 insertions(+), 23 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 60863dbd45cb..87d5c3aecfe2 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -384,7 +384,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None, silent=False, feature_names=None, feature_types=None, - nthread=None): + nthread=None, + enable_categorical=False): """Parameters ---------- data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/ @@ -419,6 +420,16 @@ def __init__(self, data, label=None, weight=None, base_margin=None, Number of threads to use for loading data when parallelization is applicable. If -1, uses maximum threads available on the system. + enable_categorical: boolean, optional + + .. versionadded:: 1.3.0 + + Experimental support of specializing for categorical features. Do + not set to True unless you are interested in development. + Currently it's only available for `gpu_hist` tree method with 1 vs + rest (one hot) categorical split. Also, JSON serialization format, + `gpu_predictor` and pandas input are required. + """ if isinstance(data, list): raise TypeError('Input data can not be a list.') @@ -437,7 +448,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None, data, missing=self.missing, threads=self.nthread, feature_names=feature_names, - feature_types=feature_types) + feature_types=feature_types, + enable_categorical=enable_categorical) assert handle is not None self.handle = handle diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 68ee917d7739..3d3fda1064b3 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -184,20 +184,24 @@ def _is_modin_df(data): } -def _transform_pandas_df(data, feature_names=None, feature_types=None, +def _transform_pandas_df(data, enable_categorical, + feature_names=None, feature_types=None, meta=None, meta_type=None): from pandas import MultiIndex, Int64Index - from pandas.api.types import is_sparse + from pandas.api.types import is_sparse, is_categorical + data_dtypes = data.dtypes - if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) + if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or + (is_categorical(dtype) and enable_categorical) for dtype in data_dtypes): bad_fields = [ str(data.columns[i]) for i, dtype in enumerate(data_dtypes) if dtype.name not in _pandas_dtype_mapper ] - msg = """DataFrame.dtypes for data must be int, float or bool. - Did not expect the data types in fields """ + msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When + categorical type is supplied, DMatrix parameter + `enable_categorical` must be set to `True`.""" raise ValueError(msg + ', '.join(bad_fields)) if feature_names is None and meta is None: @@ -216,6 +220,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None, if is_sparse(dtype): feature_types.append(_pandas_dtype_mapper[ dtype.subtype.name]) + elif is_categorical(dtype) and enable_categorical: + feature_types.append('categorical') else: feature_types.append(_pandas_dtype_mapper[dtype.name]) @@ -226,13 +232,13 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None, dtype = meta_type if meta_type else np.float32 data = np.ascontiguousarray(data.values, dtype=dtype) - return data, feature_names, feature_types -def _from_pandas_df(data, missing, nthread, feature_names, feature_types): +def _from_pandas_df(data, enable_categorical, missing, nthread, + feature_names, feature_types): data, feature_names, feature_types = _transform_pandas_df( - data, feature_names, feature_types) + data, enable_categorical, feature_names, feature_types) return _from_numpy_array(data, missing, nthread, feature_names, feature_types) @@ -244,6 +250,7 @@ def _is_pandas_series(data): return False return isinstance(data, pd.Series) + def _is_modin_series(data): try: import modin.pandas as pd @@ -507,7 +514,8 @@ def _has_array_protocol(data): def dispatch_data_backend(data, missing, threads, - feature_names, feature_types): + feature_names, feature_types, + enable_categorical=False): '''Dispatch data for DMatrix.''' if _is_scipy_csr(data): return _from_scipy_csr(data, missing, feature_names, feature_types) @@ -525,7 +533,7 @@ def dispatch_data_backend(data, missing, threads, if _is_tuple(data): return _from_tuple(data, missing, feature_names, feature_types) if _is_pandas_df(data): - return _from_pandas_df(data, missing, threads, + return _from_pandas_df(data, enable_categorical, missing, threads, feature_names, feature_types) if _is_pandas_series(data): return _from_pandas_series(data, missing, threads, feature_names, @@ -551,7 +559,7 @@ def dispatch_data_backend(data, missing, threads, return _from_dt_df(data, missing, threads, feature_names, feature_types) if _is_modin_df(data): - return _from_pandas_df(data, missing, threads, + return _from_pandas_df(data, enable_categorical, missing, threads, feature_names, feature_types) if _is_modin_series(data): return _from_pandas_series(data, missing, threads, feature_names, @@ -655,7 +663,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_df(data): - data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype) + data, _, _ = _transform_pandas_df(data, False, meta=name, + meta_type=dtype) _meta_from_numpy(data, name, dtype, handle) return if _is_pandas_series(data): @@ -680,7 +689,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): _meta_from_dt(data, name, dtype, handle) return if _is_modin_df(data): - data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype) + data, _, _ = _transform_pandas_df( + data, False, meta=name, meta_type=dtype) _meta_from_numpy(data, name, dtype, handle) return if _is_modin_series(data): diff --git a/tests/python/test_with_modin.py b/tests/python/test_with_modin.py index 05cbaf8817ec..daa2ed039883 100644 --- a/tests/python/test_with_modin.py +++ b/tests/python/test_with_modin.py @@ -67,7 +67,8 @@ def test_modin(self): # 0 1 1 0 0 # 1 2 0 1 0 # 2 3 0 0 1 - result, _, _ = xgb.data._transform_pandas_df(dummies) + result, _, _ = xgb.data._transform_pandas_df(dummies, + enable_categorical=False) exp = np.array([[1., 1., 0., 0.], [2., 0., 1., 0.], [3., 0., 0., 1.]]) @@ -113,15 +114,15 @@ def test_modin_label(self): # label must be a single column df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) self.assertRaises(ValueError, xgb.data._transform_pandas_df, df, - None, None, 'label', 'float') + False, None, None, 'label', 'float') # label must be supported dtype df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) self.assertRaises(ValueError, xgb.data._transform_pandas_df, df, - None, None, 'label', 'float') + False, None, None, 'label', 'float') df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) - result, _, _ = xgb.data._transform_pandas_df(df, None, None, + result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float') np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float)) diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py index 56aa3e9f3247..7ea8012425ff 100644 --- a/tests/python/test_with_pandas.py +++ b/tests/python/test_with_pandas.py @@ -67,7 +67,8 @@ def test_pandas(self): # 0 1 1 0 0 # 1 2 0 1 0 # 2 3 0 0 1 - result, _, _ = xgb.data._transform_pandas_df(dummies) + result, _, _ = xgb.data._transform_pandas_df(dummies, + enable_categorical=False) exp = np.array([[1., 1., 0., 0.], [2., 0., 1., 0.], [3., 0., 0., 1.]]) @@ -109,6 +110,16 @@ def test_pandas(self): assert dm.num_row() == 2 assert dm.num_col() == 6 + def test_pandas_categorical(self): + rng = np.random.RandomState(1994) + rows = 100 + X = rng.randint(3, 7, size=rows) + X = pd.Series(X, dtype="category") + X = pd.DataFrame({'f0': X}) + y = rng.randn(rows) + m = xgb.DMatrix(X, y, enable_categorical=True) + assert m.feature_types[0] == 'categorical' + def test_pandas_sparse(self): import pandas as pd rows = 100 @@ -129,15 +140,15 @@ def test_pandas_label(self): # label must be a single column df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) self.assertRaises(ValueError, xgb.data._transform_pandas_df, df, - None, None, 'label', 'float') + False, None, None, 'label', 'float') # label must be supported dtype df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) self.assertRaises(ValueError, xgb.data._transform_pandas_df, df, - None, None, 'label', 'float') + False, None, None, 'label', 'float') df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) - result, _, _ = xgb.data._transform_pandas_df(df, None, None, + result, _, _ = xgb.data._transform_pandas_df(df, False, None, None, 'label', 'float') np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float))