Encode pandas categorical feature automatically. #7231

Merged
merged 1 commit on Sep 17, 2021

43 changes: 27 additions & 16 deletions python-package/xgboost/data.py
@@ -220,61 +220,72 @@ def _is_modin_df(data):


 def _transform_pandas_df(
     data,
-    enable_categorical,
+    enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
     meta=None,
     meta_type=None,
 ):
-    from pandas import MultiIndex, Int64Index, RangeIndex
+    import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype

-    data_dtypes = data.dtypes
     if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
                (is_categorical_dtype(dtype) and enable_categorical)
-               for dtype in data_dtypes):
+               for dtype in data.dtypes):
         bad_fields = [
-            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
+            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
             if dtype.name not in _pandas_dtype_mapper
         ]

-        msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
-                categorical type is supplied, DMatrix parameter
-                `enable_categorical` must be set to `True`."""
+        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+                categorical type is supplied, DMatrix parameter `enable_categorical` must
+                be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))

     # handle feature names
     if feature_names is None and meta is None:
-        if isinstance(data.columns, MultiIndex):
+        if isinstance(data.columns, pd.MultiIndex):
             feature_names = [
                 ' '.join([str(x) for x in i]) for i in data.columns
             ]
-        elif isinstance(data.columns, (Int64Index, RangeIndex)):
+        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()

     # handle feature types
     if feature_types is None and meta is None:
         feature_types = []
-        for dtype in data_dtypes:
+        for i, dtype in enumerate(data.dtypes):
             if is_sparse(dtype):
-                feature_types.append(_pandas_dtype_mapper[
-                    dtype.subtype.name])
+                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])

+    # handle categorical codes.
+    transformed = pd.DataFrame()
+    if enable_categorical:
+        for i, dtype in enumerate(data.dtypes):
+            if is_categorical_dtype(dtype):
+                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+            else:
+                transformed[data.columns[i]] = data[data.columns[i]]
+    else:
+        transformed = data
+
     if meta and len(data.columns) > 1:
         raise ValueError(
             'DataFrame for {meta} cannot have multiple columns'.format(
                 meta=meta)
         )

     dtype = meta_type if meta_type else np.float32
-    data = data.values
+    arr = transformed.values
     if meta_type:
-        data = data.astype(meta_type)
-    return data, feature_names, feature_types
+        arr = arr.astype(meta_type)
+    return arr, feature_names, feature_types


 def _from_pandas_df(
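
For readers skimming the diff, here is a minimal standalone sketch of what the new "handle categorical codes" block does (illustrative only, not the library code itself; the sample data mirrors the added test below):

```python
# Each pandas "category" column is replaced by its integer codes before the
# values are handed to DMatrix; other columns pass through unchanged.
import pandas as pd
from pandas.api.types import is_categorical_dtype

df = pd.DataFrame({"feat_0": ["f", "o", "o"], "feat_1": [4, 3, 2]})
df["feat_0"] = df["feat_0"].astype("category")

transformed = pd.DataFrame()
for col, dtype in zip(df.columns, df.dtypes):
    if is_categorical_dtype(dtype):
        transformed[col] = df[col].cat.codes  # "f" -> 0, "o" -> 1
    else:
        transformed[col] = df[col]

print(transformed.values)  # rows: [0 4], [1 3], [1 2]
```
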
2 changes: 2 additions & 0 deletions src/tree/updater_gpu_hist.cu
@@ -582,6 +582,8 @@ struct GPUHistMakerDevice {

     auto is_cat = candidate.split.is_cat;
     if (is_cat) {
+      CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
+          << "Categorical feature value too large.";
       auto cat = common::AsCat(candidate.split.fvalue);
       std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
       LBitField32 cats_bits(split_cats);
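
The added CHECK_LT bounds the split value before it is cast to bst_cat_t and used to size the split bitset. A rough sketch of that sizing, under the assumption that bst_cat_t is a 32-bit signed integer and LBitField32 packs one bit per category into 32-bit words:

```python
# The bitset allocated for a categorical split grows linearly with the
# largest category code, so an unbounded fvalue would overflow the cast
# or demand an enormous allocation.
import math

def bitfield_words(max_cat_code: int) -> int:
    """Number of 32-bit words needed for a bitset with max_cat_code + 1 bits."""
    return math.ceil(max(max_cat_code + 1, 1) / 32)

print(bitfield_words(10))         # 1 word
print(bitfield_words(100_000))    # 3126 words (~12 KB)
print(bitfield_words(2**31 - 2))  # ~67 million words (~256 MB), just under the guard
```
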
11 changes: 11 additions & 0 deletions tests/python/test_with_pandas.py
@@ -130,6 +130,17 @@ def test_pandas_categorical(self):
         m = xgb.DMatrix(X, y, enable_categorical=True)
         assert m.feature_types[0] == 'c'

+        X_0 = ["f", "o", "o"]
+        X_1 = [4, 3, 2]
+        X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
+        X["feat_0"] = X["feat_0"].astype("category")
+        transformed, _, feature_types = xgb.data._transform_pandas_df(
+            X, enable_categorical=True
+        )
+
+        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
+        assert transformed[:, 0].min() == 0
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100
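
For completeness, an illustrative end-to-end example of the user-facing path the new test exercises (the labels `y` are made up for illustration and not part of the patch):

```python
# A pandas "category" column plus enable_categorical=True yields a DMatrix
# whose first feature is reported as categorical ('c').
import pandas as pd
import xgboost as xgb

X = pd.DataFrame({"feat_0": ["f", "o", "o"], "feat_1": [4, 3, 2]})
X["feat_0"] = X["feat_0"].astype("category")
y = [0, 1, 1]

m = xgb.DMatrix(X, y, enable_categorical=True)
assert m.feature_types[0] == 'c'
```
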