Encode pandas categorical data automatically.
trivialfis committed Sep 15, 2021
1 parent f21863d commit 49bad50
Showing 3 changed files with 36 additions and 10 deletions.
33 changes: 23 additions & 10 deletions python-package/xgboost/data.py
@@ -220,12 +220,13 @@ def _is_modin_df(data):
 
 def _transform_pandas_df(
     data,
-    enable_categorical,
+    enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
     meta=None,
     meta_type=None,
 ):
+    import pandas as pd
     from pandas import MultiIndex, Int64Index, RangeIndex
     from pandas.api.types import is_sparse, is_categorical_dtype
 
@@ -238,11 +239,12 @@ def _transform_pandas_df(
             if dtype.name not in _pandas_dtype_mapper
         ]
 
-        msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
-                categorical type is supplied, DMatrix parameter
-                `enable_categorical` must be set to `True`."""
+        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+                categorical type is supplied, DMatrix parameter `enable_categorical` must
+                be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))
 
+    # handle feature names
     if feature_names is None and meta is None:
         if isinstance(data.columns, MultiIndex):
             feature_names = [
@@ -253,28 +255,39 @@ def _transform_pandas_df(
     else:
         feature_names = data.columns.format()
 
+    # handle feature types
     if feature_types is None and meta is None:
         feature_types = []
-        for dtype in data_dtypes:
+        for i, dtype in enumerate(data_dtypes):
             if is_sparse(dtype):
-                feature_types.append(_pandas_dtype_mapper[
-                    dtype.subtype.name])
+                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
 
+    # handle categorical codes.
+    transformed = pd.DataFrame()
+    if enable_categorical:
+        for i, dtype in enumerate(data_dtypes):
+            if is_categorical_dtype(dtype):
+                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+            else:
+                transformed[data.columns[i]] = data[data.columns[i]]
+    else:
+        transformed = data
+
     if meta and len(data.columns) > 1:
         raise ValueError(
             'DataFrame for {meta} cannot have multiple columns'.format(
                 meta=meta)
         )
 
     dtype = meta_type if meta_type else np.float32
-    data = data.values
+    arr = transformed.values
     if meta_type:
-        data = data.astype(meta_type)
-    return data, feature_names, feature_types
+        arr = arr.astype(meta_type)
+    return arr, feature_names, feature_types
 
 
 def _from_pandas_df(
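For reference, a minimal sketch (not part of the commit; the DataFrame and column names are made up for illustration) of the pandas behaviour the new code path relies on: a `category` column exposes its integer encoding through `.cat.codes`, starting at 0, and that encoded frame is what `_transform_pandas_df` now hands on when `enable_categorical=True`.

    import pandas as pd

    df = pd.DataFrame({"feat_0": ["f", "o", "o"], "feat_1": [4, 3, 2]})
    df["feat_0"] = df["feat_0"].astype("category")

    # .cat.codes maps each distinct category to an integer code starting at 0,
    # so "f" -> 0 and "o" -> 1 for this column.
    print(df["feat_0"].cat.codes.to_list())  # [0, 1, 1]
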
2 changes: 2 additions & 0 deletions src/tree/updater_gpu_hist.cu
@@ -582,6 +582,8 @@ struct GPUHistMakerDevice {
 
     auto is_cat = candidate.split.is_cat;
     if (is_cat) {
+      CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
+          << "Categorical feature value too large.";
       auto cat = common::AsCat(candidate.split.fvalue);
       std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
       LBitField32 cats_bits(split_cats);
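The guard exists because the split's category value is used to size a bitset of 32-bit words on the device. Below is a rough Python analogue of that bookkeeping (my own illustration, not from the commit, assuming `bst_cat_t` is a 32-bit signed integer, which this diff does not state).

    import numpy as np

    BST_CAT_MAX = np.iinfo(np.int32).max  # assumed width of bst_cat_t

    def build_split_cats(fvalue: float) -> np.ndarray:
        # Counterpart of the CHECK_LT added above: refuse values that cannot
        # be represented as a category index.
        if not fvalue < BST_CAT_MAX:
            raise ValueError("Categorical feature value too large.")
        cat = int(fvalue)
        # One bit per category, packed into 32-bit words, with the bit for the
        # split category set -- loosely what LBitField32 does on the device.
        words = np.zeros((max(cat + 1, 1) + 31) // 32, dtype=np.uint32)
        words[cat // 32] |= np.uint32(1 << (cat % 32))
        return words
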
11 changes: 11 additions & 0 deletions tests/python/test_with_pandas.py
@@ -130,6 +130,17 @@ def test_pandas_categorical(self):
         m = xgb.DMatrix(X, y, enable_categorical=True)
         assert m.feature_types[0] == 'c'
 
+        X_0 = ["f", "o", "o"]
+        X_1 = [4, 3, 2]
+        X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
+        X["feat_0"] = X["feat_0"].astype("category")
+        transformed, _, feature_types = xgb.data._transform_pandas_df(
+            X, enable_categorical=True
+        )
+
+        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
+        assert transformed[:, 0].min() == 0
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100
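Putting the three changes together, an end-to-end usage sketch might look like the following. The dataset and parameter values are assumptions for illustration, and `gpu_hist` requires a GPU build, since the categorical split path touched above lives in that updater.

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    X = pd.DataFrame({
        "feat_0": pd.Series(["f", "o", "o", "b", "a", "r"], dtype="category"),
        "feat_1": np.arange(6, dtype=np.float32),
    })
    y = np.array([0, 1, 1, 0, 1, 0])

    # The categorical column is encoded automatically; no manual .cat.codes step.
    dtrain = xgb.DMatrix(X, y, enable_categorical=True)
    booster = xgb.train(
        {"tree_method": "gpu_hist", "objective": "binary:logistic"},
        dtrain,
        num_boost_round=4,
    )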
