Commit

Encode pandas categorical data automatically. (#7231)
trivialfis committed Sep 17, 2021
1 parent 32e0858 commit 22d56ce
Showing 3 changed files with 40 additions and 16 deletions.
python-package/xgboost/data.py: 43 changes (27 additions & 16 deletions)
@@ -220,61 +220,72 @@ def _is_modin_df(data):

 def _transform_pandas_df(
     data,
-    enable_categorical,
+    enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
     meta=None,
     meta_type=None,
 ):
-    from pandas import MultiIndex, Int64Index, RangeIndex
+    import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype

-    data_dtypes = data.dtypes
     if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
                (is_categorical_dtype(dtype) and enable_categorical)
-               for dtype in data_dtypes):
+               for dtype in data.dtypes):
         bad_fields = [
-            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
+            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
             if dtype.name not in _pandas_dtype_mapper
         ]

-        msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
-                categorical type is supplied, DMatrix parameter
-                `enable_categorical` must be set to `True`."""
+        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+                categorical type is supplied, DMatrix parameter `enable_categorical` must
+                be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))

     # handle feature names
     if feature_names is None and meta is None:
-        if isinstance(data.columns, MultiIndex):
+        if isinstance(data.columns, pd.MultiIndex):
             feature_names = [
                 ' '.join([str(x) for x in i]) for i in data.columns
             ]
-        elif isinstance(data.columns, (Int64Index, RangeIndex)):
+        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()

     # handle feature types
     if feature_types is None and meta is None:
         feature_types = []
-        for dtype in data_dtypes:
+        for i, dtype in enumerate(data.dtypes):
             if is_sparse(dtype):
-                feature_types.append(_pandas_dtype_mapper[
-                    dtype.subtype.name])
+                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])

+    # handle categorical codes.
+    transformed = pd.DataFrame()
+    if enable_categorical:
+        for i, dtype in enumerate(data.dtypes):
+            if is_categorical_dtype(dtype):
+                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+            else:
+                transformed[data.columns[i]] = data[data.columns[i]]
+    else:
+        transformed = data
+
     if meta and len(data.columns) > 1:
         raise ValueError(
             'DataFrame for {meta} cannot have multiple columns'.format(
                 meta=meta)
         )

     dtype = meta_type if meta_type else np.float32
-    data = data.values
+    arr = transformed.values
     if meta_type:
-        data = data.astype(meta_type)
-    return data, feature_names, feature_types
+        arr = arr.astype(meta_type)
+    return arr, feature_names, feature_types


 def _from_pandas_df(
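The new "handle categorical codes" branch is the heart of the change: when enable_categorical=True, every pandas category column is replaced by its integer category codes before the values are extracted, while non-categorical columns pass through unchanged (pandas maps missing categorical values to the code -1). The standalone sketch below uses toy data and plain pandas to show what that transformation produces; it mirrors the new branch but is not the library code itself.

import pandas as pd
from pandas.api.types import is_categorical_dtype

# Toy frame mirroring the new test case: one "category" column, one numeric column.
df = pd.DataFrame({"feat_0": ["f", "o", "o"], "feat_1": [4, 3, 2]})
df["feat_0"] = df["feat_0"].astype("category")

# Same idea as the new branch above: categorical columns become their integer
# codes ('f' -> 0, 'o' -> 1 here); everything else is copied through.
transformed = pd.DataFrame()
for col, dtype in df.dtypes.items():
    if is_categorical_dtype(dtype):
        transformed[col] = df[col].cat.codes
    else:
        transformed[col] = df[col]

print(transformed)  # feat_0 now holds small integer codes; feat_1 is unchanged.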
src/tree/updater_gpu_hist.cu: 2 changes (2 additions & 0 deletions)
@@ -582,6 +582,8 @@ struct GPUHistMakerDevice {

     auto is_cat = candidate.split.is_cat;
     if (is_cat) {
+      CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
+          << "Categorical feature value too large.";
       auto cat = common::AsCat(candidate.split.fvalue);
       std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
       LBitField32 cats_bits(split_cats);
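For a categorical split, candidate.split.fvalue holds the matching category, which is narrowed to bst_cat_t and then used to size a 32-bit bitset of categories. The new CHECK_LT guards that narrowing: a category value too large for bst_cat_t now fails with a clear error instead of silently overflowing. The Python sketch below illustrates how such a one-category bitset is sized and filled; the semantics assumed for LBitField32::ComputeStorageSize (number of 32-bit words needed to hold N bits) are an assumption, not the C++ implementation.

def storage_size_u32(n_bits: int) -> int:
    # Assumed equivalent of LBitField32::ComputeStorageSize: how many
    # 32-bit words are needed to hold n_bits bits.
    return (n_bits + 31) // 32

def make_split_cats(cat: int) -> list:
    # Bitset with only bit `cat` set, i.e. "this category goes to one child".
    words = [0] * storage_size_u32(max(cat + 1, 1))
    words[cat // 32] |= 1 << (cat % 32)
    return words

# Category 35 needs two 32-bit words; bit 3 of the second word is set.
assert make_split_cats(35) == [0, 1 << 3]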
tests/python/test_with_pandas.py: 11 changes (11 additions & 0 deletions)
@@ -130,6 +130,17 @@ def test_pandas_categorical(self):
         m = xgb.DMatrix(X, y, enable_categorical=True)
         assert m.feature_types[0] == 'c'

+        X_0 = ["f", "o", "o"]
+        X_1 = [4, 3, 2]
+        X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
+        X["feat_0"] = X["feat_0"].astype("category")
+        transformed, _, feature_types = xgb.data._transform_pandas_df(
+            X, enable_categorical=True
+        )
+
+        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
+        assert transformed[:, 0].min() == 0
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100
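For completeness, a hypothetical end-to-end use of the behaviour this test exercises: with enable_categorical=True, pandas category columns are encoded automatically while the DMatrix is built, so no manual .cat.codes step is needed. Categorical splits were still experimental at this point and are assumed here to require the GPU tree method, so the sketch uses tree_method="gpu_hist" and synthetic data.

import numpy as np
import pandas as pd
import xgboost as xgb

rng = np.random.default_rng(0)
X = pd.DataFrame({
    "color": pd.Categorical(rng.choice(["red", "green", "blue"], size=100)),
    "size": rng.normal(size=100),
})
y = rng.normal(size=100)

# The category column is encoded for us when the DMatrix is constructed.
dtrain = xgb.DMatrix(X, label=y, enable_categorical=True)
booster = xgb.train({"tree_method": "gpu_hist"}, dtrain, num_boost_round=10)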
