Skip to content

Commit

Permalink
Enable categorical data support on Python DMatrix. (#6166)
Browse files Browse the repository at this point in the history
* Only pandas is recognized.
  • Loading branch information
trivialfis committed Sep 29, 2020
1 parent 52c0b3f commit 7622b8c
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 23 deletions.
16 changes: 14 additions & 2 deletions python-package/xgboost/core.py
Expand Up @@ -384,7 +384,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
silent=False,
feature_names=None,
feature_types=None,
nthread=None):
nthread=None,
enable_categorical=False):
"""Parameters
----------
data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
Expand Down Expand Up @@ -419,6 +420,16 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
enable_categorical: boolean, optional
.. versionadded:: 1.3.0
Experimental support of specializing for categorical features. Do
not set to True unless you are interested in development.
Currently it's only available for `gpu_hist` tree method with 1 vs
rest (one hot) categorical split. Also, JSON serialization format,
`gpu_predictor` and pandas input are required.
"""
if isinstance(data, list):
raise TypeError('Input data can not be a list.')
Expand All @@ -437,7 +448,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
data, missing=self.missing,
threads=self.nthread,
feature_names=feature_names,
feature_types=feature_types)
feature_types=feature_types,
enable_categorical=enable_categorical)
assert handle is not None
self.handle = handle

Expand Down
36 changes: 23 additions & 13 deletions python-package/xgboost/data.py
Expand Up @@ -184,20 +184,24 @@ def _is_modin_df(data):
}


def _transform_pandas_df(data, feature_names=None, feature_types=None,
def _transform_pandas_df(data, enable_categorical,
feature_names=None, feature_types=None,
meta=None, meta_type=None):
from pandas import MultiIndex, Int64Index
from pandas.api.types import is_sparse
from pandas.api.types import is_sparse, is_categorical

data_dtypes = data.dtypes
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
(is_categorical(dtype) and enable_categorical)
for dtype in data_dtypes):
bad_fields = [
str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
if dtype.name not in _pandas_dtype_mapper
]

msg = """DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in fields """
msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
categorical type is supplied, DMatrix parameter
`enable_categorical` must be set to `True`."""
raise ValueError(msg + ', '.join(bad_fields))

if feature_names is None and meta is None:
Expand All @@ -216,6 +220,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
if is_sparse(dtype):
feature_types.append(_pandas_dtype_mapper[
dtype.subtype.name])
elif is_categorical(dtype) and enable_categorical:
feature_types.append('categorical')
else:
feature_types.append(_pandas_dtype_mapper[dtype.name])

Expand All @@ -226,13 +232,13 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,

dtype = meta_type if meta_type else np.float32
data = np.ascontiguousarray(data.values, dtype=dtype)

return data, feature_names, feature_types


def _from_pandas_df(data, missing, nthread, feature_names, feature_types):
def _from_pandas_df(data, enable_categorical, missing, nthread,
feature_names, feature_types):
data, feature_names, feature_types = _transform_pandas_df(
data, feature_names, feature_types)
data, enable_categorical, feature_names, feature_types)
return _from_numpy_array(data, missing, nthread, feature_names,
feature_types)

Expand All @@ -244,6 +250,7 @@ def _is_pandas_series(data):
return False
return isinstance(data, pd.Series)


def _is_modin_series(data):
try:
import modin.pandas as pd
Expand Down Expand Up @@ -507,7 +514,8 @@ def _has_array_protocol(data):


def dispatch_data_backend(data, missing, threads,
feature_names, feature_types):
feature_names, feature_types,
enable_categorical=False):
'''Dispatch data for DMatrix.'''
if _is_scipy_csr(data):
return _from_scipy_csr(data, missing, feature_names, feature_types)
Expand All @@ -525,7 +533,7 @@ def dispatch_data_backend(data, missing, threads,
if _is_tuple(data):
return _from_tuple(data, missing, feature_names, feature_types)
if _is_pandas_df(data):
return _from_pandas_df(data, missing, threads,
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_pandas_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
Expand All @@ -551,7 +559,7 @@ def dispatch_data_backend(data, missing, threads,
return _from_dt_df(data, missing, threads, feature_names,
feature_types)
if _is_modin_df(data):
return _from_pandas_df(data, missing, threads,
return _from_pandas_df(data, enable_categorical, missing, threads,
feature_names, feature_types)
if _is_modin_series(data):
return _from_pandas_series(data, missing, threads, feature_names,
Expand Down Expand Up @@ -655,7 +663,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_df(data):
data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(data, False, meta=name,
meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_pandas_series(data):
Expand All @@ -680,7 +689,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
_meta_from_dt(data, name, dtype, handle)
return
if _is_modin_df(data):
data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
data, _, _ = _transform_pandas_df(
data, False, meta=name, meta_type=dtype)
_meta_from_numpy(data, name, dtype, handle)
return
if _is_modin_series(data):
Expand Down
9 changes: 5 additions & 4 deletions tests/python/test_with_modin.py
Expand Up @@ -67,7 +67,8 @@ def test_modin(self):
# 0 1 1 0 0
# 1 2 0 1 0
# 2 3 0 0 1
result, _, _ = xgb.data._transform_pandas_df(dummies)
result, _, _ = xgb.data._transform_pandas_df(dummies,
enable_categorical=False)
exp = np.array([[1., 1., 0., 0.],
[2., 0., 1., 0.],
[3., 0., 0., 1.]])
Expand Down Expand Up @@ -113,15 +114,15 @@ def test_modin_label(self):
# label must be a single column
df = md.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
False, None, None, 'label', 'float')

# label must be supported dtype
df = md.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
False, None, None, 'label', 'float')

df = md.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(df, None, None,
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
'label', 'float')
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
dtype=float))
Expand Down
19 changes: 15 additions & 4 deletions tests/python/test_with_pandas.py
Expand Up @@ -67,7 +67,8 @@ def test_pandas(self):
# 0 1 1 0 0
# 1 2 0 1 0
# 2 3 0 0 1
result, _, _ = xgb.data._transform_pandas_df(dummies)
result, _, _ = xgb.data._transform_pandas_df(dummies,
enable_categorical=False)
exp = np.array([[1., 1., 0., 0.],
[2., 0., 1., 0.],
[3., 0., 0., 1.]])
Expand Down Expand Up @@ -109,6 +110,16 @@ def test_pandas(self):
assert dm.num_row() == 2
assert dm.num_col() == 6

def test_pandas_categorical(self):
rng = np.random.RandomState(1994)
rows = 100
X = rng.randint(3, 7, size=rows)
X = pd.Series(X, dtype="category")
X = pd.DataFrame({'f0': X})
y = rng.randn(rows)
m = xgb.DMatrix(X, y, enable_categorical=True)
assert m.feature_types[0] == 'categorical'

def test_pandas_sparse(self):
import pandas as pd
rows = 100
Expand All @@ -129,15 +140,15 @@ def test_pandas_label(self):
# label must be a single column
df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]})
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
False, None, None, 'label', 'float')

# label must be supported dtype
df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)})
self.assertRaises(ValueError, xgb.data._transform_pandas_df, df,
None, None, 'label', 'float')
False, None, None, 'label', 'float')

df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)})
result, _, _ = xgb.data._transform_pandas_df(df, None, None,
result, _, _ = xgb.data._transform_pandas_df(df, False, None, None,
'label', 'float')
np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]],
dtype=float))
Expand Down

0 comments on commit 7622b8c

Please sign in to comment.