Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support more input types for categorical data. #7220

Merged
merged 6 commits into from Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion demo/guide-python/categorical.py
Expand Up @@ -44,7 +44,8 @@ def make_categorical(

def main() -> None:
# Use builtin categorical data support
# Must be pandas DataFrame or cudf DataFrame with categorical data
# For scikit-learn interface, the input data must be pandas DataFrame or cudf
# DataFrame with categorical features
X, y = make_categorical(100, 10, 4, False)
# Specify `enable_categorical` to True.
reg = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)
Expand Down
2 changes: 1 addition & 1 deletion include/xgboost/feature_map.h
Expand Up @@ -83,7 +83,7 @@ class FeatureMap {
if (!strcmp("q", tname)) return kQuantitive;
if (!strcmp("int", tname)) return kInteger;
if (!strcmp("float", tname)) return kFloat;
if (!strcmp("categorical", tname)) return kCategorical;
if (!strcmp("c", tname)) return kCategorical;
LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
return kIndicator;
}
Expand Down
47 changes: 29 additions & 18 deletions python-package/xgboost/core.py
Expand Up @@ -518,8 +518,8 @@ def __init__(
base_margin=None,
missing: Optional[float] = None,
silent=False,
feature_names=None,
feature_types=None,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
nthread: Optional[int] = None,
group=None,
qid=None,
Expand Down Expand Up @@ -558,8 +558,11 @@ def __init__(
Whether to print messages during construction
feature_names : list, optional
Set names for features.
feature_types : list, optional
Set types for features.
feature_types :

Set types for features. When `enable_categorical` is set to `True`, string
"c" represents categorical data type.

nthread : integer, optional
Number of threads to use for loading data when parallelization is
applicable. If -1, uses maximum threads available on the system.
Expand All @@ -577,11 +580,10 @@ def __init__(

.. versionadded:: 1.3.0

Experimental support of specializing for categorical features. Do
not set to True unless you are interested in development.
Currently it's only available for `gpu_hist` tree method with 1 vs
rest (one hot) categorical split. Also, JSON serialization format,
`gpu_predictor` and pandas input are required.
Experimental support of specializing for categorical features. Do not set to
True unless you are interested in development. Currently it's only available
for `gpu_hist` tree method with 1 vs rest (one hot) categorical split. Also,
JSON serialization format is required.

"""
if group is not None and qid is not None:
Expand Down Expand Up @@ -673,8 +675,8 @@ def set_info(
qid=None,
label_lower_bound=None,
label_upper_bound=None,
feature_names=None,
feature_types=None,
feature_names: Optional[List[str]] = None,
feature_types: Optional[List[str]] = None,
feature_weights=None
) -> None:
"""Set meta info for DMatrix. See doc string for :py:obj:`xgboost.DMatrix`."""
Expand Down Expand Up @@ -945,7 +947,7 @@ def slice(
return res

@property
def feature_names(self) -> List[str]:
def feature_names(self) -> Optional[List[str]]:
"""Get feature names (column labels).

Returns
Expand Down Expand Up @@ -1033,17 +1035,21 @@ def feature_types(self) -> Optional[List[str]]:
return res

@feature_types.setter
def feature_types(self, feature_types: Optional[Union[List[Any], Any]]) -> None:
def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
"""Set feature types (column types).

This is for displaying the results and unrelated
to the learning process.
This is for displaying the results and categorical data support. See doc string
of :py:obj:`xgboost.DMatrix` for details.

Parameters
----------
feature_types : list or None
Labels for features. None will reset existing feature types

"""
# For compatibility reasons, this function wraps a single str input into a list. But
# we should not promote such usage since, other than visualization, the field is
# also used for specifying categorical data type.
if feature_types is not None:
if not isinstance(feature_types, (list, str)):
raise TypeError(
Expand Down Expand Up @@ -2461,8 +2467,13 @@ def _validate_features(self, data: DMatrix):

raise ValueError(msg.format(self.feature_names, data.feature_names))

def get_split_value_histogram(self, feature, fmap='', bins=None,
as_pandas=True):
def get_split_value_histogram(
self,
feature: str,
fmap: Union[os.PathLike, str] = '',
bins: Optional[int] = None,
as_pandas: bool = True
):
"""Get split value histogram of a feature

Parameters
Expand Down Expand Up @@ -2510,7 +2521,7 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
except (ValueError, AttributeError, TypeError):
# None.index: attr err, None[0]: type err, fn.index(-1): value err
feature_t = None
if feature_t == "categorical":
if feature_t == "c": # categorical
raise ValueError(
"Split value historgam doesn't support categorical split."
)
Expand Down