Skip to content

Commit

Permalink
Expose feature_types to sklearn interface. (#7821)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Apr 21, 2022
1 parent 401d451 commit c70fa50
Show file tree
Hide file tree
Showing 7 changed files with 131 additions and 48 deletions.
5 changes: 3 additions & 2 deletions python-package/xgboost/_typing.py
@@ -1,15 +1,16 @@
"""Shared typing definition."""
import ctypes
import os
from typing import Optional, List, Any, TypeVar, Union
from typing import Optional, Any, TypeVar, Union, Sequence

# os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/dt.Frame/
# cudf.DataFrame/cupy.array/dlpack
DataType = Any

# xgboost accepts some other possible types in practice for historical reasons, but those
# are less tested. For now we encourage users to pass a simple list of strings.
FeatureNames = Optional[List[str]]
FeatureNames = Optional[Sequence[str]]
FeatureTypes = Optional[Sequence[str]]

ArrayLike = Any
PathLike = Union[str, os.PathLike]
Expand Down
27 changes: 14 additions & 13 deletions python-package/xgboost/core.py
Expand Up @@ -31,6 +31,7 @@
CFloatPtr,
NumpyOrCupy,
FeatureNames,
FeatureTypes,
_T,
CupyT,
)
Expand Down Expand Up @@ -553,7 +554,7 @@ def __init__(
missing: Optional[float] = None,
silent: bool = False,
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_types: FeatureTypes = None,
nthread: Optional[int] = None,
group: Optional[ArrayLike] = None,
qid: Optional[ArrayLike] = None,
Expand Down Expand Up @@ -594,10 +595,15 @@ def __init__(
Whether to print messages during construction
feature_names : list, optional
Set names for features.
feature_types :
feature_types : FeatureTypes
Set types for features. When `enable_categorical` is set to `True`, string
"c" represents categorical data type.
"c" represents categorical data type while "q" represents numerical feature
type. For categorical features, the input is assumed to be preprocessed and
encoded by the users. The encoding can be done via
:py:class:`sklearn.preprocessing.OrdinalEncoder` or the pandas
`Series.cat.codes` accessor. This is useful when users want to specify categorical
features without having to construct a dataframe as input.
nthread : integer, optional
Number of threads to use for loading data when parallelization is
Expand Down Expand Up @@ -1062,12 +1068,7 @@ def feature_names(self, feature_names: FeatureNames) -> None:

@property
def feature_types(self) -> Optional[List[str]]:
"""Get feature types (column types).
Returns
-------
feature_types : list or None
"""
"""Get feature types. See :py:class:`DMatrix` for details."""
length = c_bst_ulong()
sarr = ctypes.POINTER(ctypes.c_char_p)()
_check_call(_LIB.XGDMatrixGetStrFeatureInfo(self.handle,
Expand All @@ -1083,8 +1084,8 @@ def feature_types(self) -> Optional[List[str]]:
def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
"""Set feature types (column types).
This is for displaying the results and categorical data support. See doc string
of :py:obj:`xgboost.DMatrix` for details.
This is for displaying the results and categorical data support. See
:py:class:`DMatrix` for details.
Parameters
----------
Expand Down Expand Up @@ -1647,7 +1648,7 @@ def _get_feature_info(self, field: str) -> Optional[List[str]]:
feature_info = from_cstr_to_pystr(sarr, length)
return feature_info if feature_info else None

def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
def _set_feature_info(self, features: Optional[Sequence[str]], field: str) -> None:
if features is not None:
assert isinstance(features, list)
feature_info_bytes = [bytes(f, encoding="utf-8") for f in features]
Expand All @@ -1667,7 +1668,7 @@ def _set_feature_info(self, features: Optional[List[str]], field: str) -> None:
@property
def feature_types(self) -> Optional[List[str]]:
"""Feature types for this booster. Can be directly set by input data or by
assignment.
assignment. See :py:class:`DMatrix` for details.
"""
return self._get_feature_info("feature_type")
Expand Down
18 changes: 14 additions & 4 deletions python-package/xgboost/dask.py
Expand Up @@ -54,10 +54,11 @@
from .compat import PANDAS_INSTALLED, DataFrame, Series, pandas_concat
from .compat import lazy_isinstance

from ._typing import FeatureNames, FeatureTypes

from .core import DMatrix, DeviceQuantileDMatrix, Booster, _expect, DataIter
from .core import Objective, Metric
from .core import _deprecate_positional_args, _has_categorical
from .data import FeatureNames
from .training import train as worker_train
from .tracker import RabitTracker, get_host_ip
from .sklearn import XGBModel, XGBClassifier, XGBRegressorBase, XGBClassifierBase
Expand Down Expand Up @@ -327,7 +328,7 @@ def __init__(
missing: float = None,
silent: bool = False, # pylint: disable=unused-argument
feature_names: FeatureNames = None,
feature_types: Optional[List[str]] = None,
feature_types: FeatureTypes = None,
group: Optional[_DaskCollection] = None,
qid: Optional[_DaskCollection] = None,
label_lower_bound: Optional[_DaskCollection] = None,
Expand Down Expand Up @@ -1601,7 +1602,11 @@ async def _predict_async(
predts = predts.to_dask_array()
else:
test_dmatrix = await DaskDMatrix(
self.client, data=data, base_margin=base_margin, missing=self.missing
self.client,
data=data,
base_margin=base_margin,
missing=self.missing,
feature_types=self.feature_types
)
predts = await predict(
self.client,
Expand Down Expand Up @@ -1640,7 +1645,9 @@ async def _apply_async(
iteration_range: Optional[Tuple[int, int]] = None,
) -> Any:
iteration_range = self._get_iteration_range(iteration_range)
test_dmatrix = await DaskDMatrix(self.client, data=X, missing=self.missing)
test_dmatrix = await DaskDMatrix(
self.client, data=X, missing=self.missing, feature_types=self.feature_types,
)
predts = await predict(
self.client,
model=self.get_booster(),
Expand Down Expand Up @@ -1755,6 +1762,7 @@ async def _fit_async(
eval_qid=None,
missing=self.missing,
enable_categorical=self.enable_categorical,
feature_types=self.feature_types,
)

if callable(self.objective):
Expand Down Expand Up @@ -1849,6 +1857,7 @@ async def _fit_async(
eval_qid=None,
missing=self.missing,
enable_categorical=self.enable_categorical,
feature_types=self.feature_types,
)

# pylint: disable=attribute-defined-outside-init
Expand Down Expand Up @@ -2054,6 +2063,7 @@ async def _fit_async(
eval_qid=eval_qid,
missing=self.missing,
enable_categorical=self.enable_categorical,
feature_types=self.feature_types,
)
if eval_metric is not None:
if callable(eval_metric):
Expand Down
57 changes: 31 additions & 26 deletions python-package/xgboost/data.py
Expand Up @@ -13,6 +13,7 @@
from .core import c_array, _LIB, _check_call, c_str
from .core import _cuda_array_interface
from .core import DataIter, _ProxyDMatrix, DMatrix, FeatureNames
from ._typing import FeatureTypes
from .compat import lazy_isinstance, DataFrame

c_bst_ulong = ctypes.c_uint64 # pylint: disable=invalid-name
Expand Down Expand Up @@ -70,7 +71,7 @@ def _from_scipy_csr(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
"""Initialize data from a CSR matrix."""
if len(data.indices) != len(data.data):
Expand Down Expand Up @@ -109,7 +110,7 @@ def _from_scipy_csc(
data,
missing,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
if len(data.indices) != len(data.data):
raise ValueError(f"length mismatch: {len(data.indices)} vs {len(data.data)}")
Expand Down Expand Up @@ -165,7 +166,7 @@ def _from_numpy_array(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
"""Initialize data from a 2-D numpy matrix.
Expand Down Expand Up @@ -228,6 +229,12 @@ def _is_modin_df(data):
}


_ENABLE_CAT_ERR = (
"When categorical type is supplied, DMatrix parameter `enable_categorical` must "
"be set to `True`."
)


def _invalid_dataframe_dtype(data: Any) -> None:
# pandas series has `dtypes` but it's just a single object
# cudf series doesn't have `dtypes`.
Expand All @@ -241,9 +248,8 @@ def _invalid_dataframe_dtype(data: Any) -> None:
else:
err = ""

msg = """DataFrame.dtypes for data must be int, float, bool or category. When
categorical type is supplied, DMatrix parameter `enable_categorical` must
be set to `True`.""" + err
type_err = "DataFrame.dtypes for data must be int, float, bool or category."
msg = f"""{type_err} {_ENABLE_CAT_ERR} {err}"""
raise ValueError(msg)


Expand Down Expand Up @@ -340,8 +346,8 @@ def _from_pandas_df(
missing: float,
nthread: int,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
feature_types: FeatureTypes,
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
data, feature_names, feature_types = _transform_pandas_df(
data, enable_categorical, feature_names, feature_types
)
Expand Down Expand Up @@ -382,7 +388,7 @@ def _from_pandas_series(
nthread: int,
enable_categorical: bool,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
from pandas.api.types import is_categorical_dtype

Expand Down Expand Up @@ -413,7 +419,7 @@ def _is_dt_df(data):
def _transform_dt_df(
data,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
meta=None,
meta_type=None,
):
Expand Down Expand Up @@ -454,9 +460,9 @@ def _from_dt_df(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, FeatureNames, Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
if enable_categorical:
raise ValueError("categorical data in datatable is not supported yet.")
data, feature_names, feature_types = _transform_dt_df(
Expand Down Expand Up @@ -542,10 +548,10 @@ def _from_arrow(
data,
missing: float,
nthread: int,
feature_names: Optional[List[str]],
feature_types: Optional[List[str]],
feature_names: FeatureNames,
feature_types: FeatureTypes,
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Optional[List[str]], Optional[List[str]]]:
) -> Tuple[ctypes.c_void_p, FeatureNames, FeatureTypes]:
import pyarrow as pa

if not all(
Expand Down Expand Up @@ -621,7 +627,7 @@ def _cudf_array_interfaces(data, cat_codes: list) -> bytes:
def _transform_cudf_df(
data,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
enable_categorical: bool,
):
try:
Expand Down Expand Up @@ -687,7 +693,7 @@ def _from_cudf_df(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
enable_categorical: bool,
) -> Tuple[ctypes.c_void_p, Any, Any]:
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
Expand Down Expand Up @@ -735,7 +741,7 @@ def _from_cupy_array(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
"""Initialize DMatrix from cupy ndarray."""
data = _transform_cupy_array(data)
Expand Down Expand Up @@ -782,7 +788,7 @@ def _from_dlpack(
missing,
nthread,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
data = _transform_dlpack(data)
return _from_cupy_array(data, missing, nthread, feature_names,
Expand All @@ -797,7 +803,7 @@ def _from_uri(
data,
missing,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
_warn_unused_missing(data, missing)
handle = ctypes.c_void_p()
Expand All @@ -817,7 +823,7 @@ def _from_list(
missing,
n_threads,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
array = np.array(data)
_check_data_shape(data)
Expand All @@ -833,7 +839,7 @@ def _from_tuple(
missing,
n_threads,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
):
return _from_list(data, missing, n_threads, feature_names, feature_types)

Expand Down Expand Up @@ -869,7 +875,7 @@ def dispatch_data_backend(
missing,
threads,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
enable_categorical: bool = False,
):
'''Dispatch data for DMatrix.'''
Expand All @@ -884,8 +890,7 @@ def dispatch_data_backend(
data.tocsr(), missing, threads, feature_names, feature_types
)
if _is_numpy_array(data):
return _from_numpy_array(data, missing, threads, feature_names,
feature_types)
return _from_numpy_array(data, missing, threads, feature_names, feature_types)
if _is_uri(data):
return _from_uri(data, missing, feature_names, feature_types)
if _is_list(data):
Expand Down Expand Up @@ -1101,7 +1106,7 @@ def reset(self) -> None:
def _proxy_transform(
data,
feature_names: FeatureNames,
feature_types: Optional[List[str]],
feature_types: FeatureTypes,
enable_categorical: bool,
):
if _is_cudf_df(data) or _is_cudf_ser(data):
Expand Down

0 comments on commit c70fa50

Please sign in to comment.