diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
index 8a446e287c5f..48d1884ef12c 100644
--- a/python-package/xgboost/data.py
+++ b/python-package/xgboost/data.py
@@ -220,50 +220,61 @@ def _is_modin_df(data):
 
 
 def _transform_pandas_df(
     data,
-    enable_categorical,
+    enable_categorical: bool,
     feature_names: Optional[List[str]] = None,
     feature_types: Optional[List[str]] = None,
     meta=None,
     meta_type=None,
 ):
-    from pandas import MultiIndex, Int64Index, RangeIndex
+    import pandas as pd
     from pandas.api.types import is_sparse, is_categorical_dtype
 
-    data_dtypes = data.dtypes
     if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
                (is_categorical_dtype(dtype) and enable_categorical)
-               for dtype in data_dtypes):
+               for dtype in data.dtypes):
         bad_fields = [
-            str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
+            str(data.columns[i]) for i, dtype in enumerate(data.dtypes)
             if dtype.name not in _pandas_dtype_mapper
         ]
-        msg = """DataFrame.dtypes for data must be int, float, bool or categorical. When
-                categorical type is supplied, DMatrix parameter
-                `enable_categorical` must be set to `True`."""
+        msg = """DataFrame.dtypes for data must be int, float, bool or category. When
+                categorical type is supplied, DMatrix parameter `enable_categorical` must
+                be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))
 
+    # handle feature names
     if feature_names is None and meta is None:
-        if isinstance(data.columns, MultiIndex):
+        if isinstance(data.columns, pd.MultiIndex):
             feature_names = [
                 ' '.join([str(x) for x in i]) for i in data.columns
             ]
-        elif isinstance(data.columns, (Int64Index, RangeIndex)):
+        elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):
             feature_names = list(map(str, data.columns))
         else:
             feature_names = data.columns.format()
 
+    # handle feature types
     if feature_types is None and meta is None:
         feature_types = []
-        for dtype in data_dtypes:
+        for i, dtype in enumerate(data.dtypes):
             if is_sparse(dtype):
-                feature_types.append(_pandas_dtype_mapper[
-                    dtype.subtype.name])
+                feature_types.append(_pandas_dtype_mapper[dtype.subtype.name])
             elif is_categorical_dtype(dtype) and enable_categorical:
                 feature_types.append(CAT_T)
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
 
+    # handle categorical codes.
+    transformed = pd.DataFrame()
+    if enable_categorical:
+        for i, dtype in enumerate(data.dtypes):
+            if is_categorical_dtype(dtype):
+                transformed[data.columns[i]] = data[data.columns[i]].cat.codes
+            else:
+                transformed[data.columns[i]] = data[data.columns[i]]
+    else:
+        transformed = data
+
     if meta and len(data.columns) > 1:
         raise ValueError(
             'DataFrame for {meta} cannot have multiple columns'.format(
@@ -271,10 +282,10 @@ def _transform_pandas_df(
         )
 
     dtype = meta_type if meta_type else np.float32
-    data = data.values
+    arr = transformed.values
     if meta_type:
-        data = data.astype(meta_type)
-    return data, feature_names, feature_types
+        arr = arr.astype(meta_type)
+    return arr, feature_names, feature_types
 
 
 def _from_pandas_df(
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 6cd66d21fcaa..1014bf53a916 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -582,6 +582,8 @@ struct GPUHistMakerDevice {
 
     auto is_cat = candidate.split.is_cat;
     if (is_cat) {
+      CHECK_LT(candidate.split.fvalue, std::numeric_limits<bst_cat_t>::max())
+          << "Categorical feature value too large.";
       auto cat = common::AsCat(candidate.split.fvalue);
       std::vector<uint32_t> split_cats(LBitField32::ComputeStorageSize(std::max(cat+1, 1)), 0);
       LBitField32 cats_bits(split_cats);
diff --git a/tests/python/test_with_pandas.py b/tests/python/test_with_pandas.py
index dbd8ceb7e637..67d1d66f65ba 100644
--- a/tests/python/test_with_pandas.py
+++ b/tests/python/test_with_pandas.py
@@ -130,6 +130,17 @@ def test_pandas_categorical(self):
         m = xgb.DMatrix(X, y, enable_categorical=True)
         assert m.feature_types[0] == 'c'
 
+        X_0 = ["f", "o", "o"]
+        X_1 = [4, 3, 2]
+        X = pd.DataFrame({"feat_0": X_0, "feat_1": X_1})
+        X["feat_0"] = X["feat_0"].astype("category")
+        transformed, _, feature_types = xgb.data._transform_pandas_df(
+            X, enable_categorical=True
+        )
+
+        assert np.issubdtype(transformed[:, 0].dtype, np.integer)
+        assert transformed[:, 0].min() == 0
+
     def test_pandas_sparse(self):
         import pandas as pd
         rows = 100