Support more input types for categorical data. (#7220)

* Support more input types for categorical data.
* Shorten the type name from "categorical" to "c".
* Tests for np/cp array and scipy csr/csc/coo.
* Specify the type for feature info.
dmlc · Sep 16, 2021 · 0ed979b · 0ed979b
1 parent 2942dc6
commit 0ed979b
Show file tree

Hide file tree

Showing 11 changed files with 229 additions and 61 deletions.
diff --git a/demo/guide-python/categorical.py b/demo/guide-python/categorical.py
@@ -44,7 +44,8 @@ def make_categorical(
 
 def main() -> None:
     # Use builtin categorical data support
-    # Must be pandas DataFrame or cudf DataFrame with categorical data
+    # For scikit-learn interface, the input data must be pandas DataFrame or cudf
+    # DataFrame with categorical features
     X, y = make_categorical(100, 10, 4, False)
     # Specify `enable_categorical` to True.
     reg = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)

diff --git a/include/xgboost/feature_map.h b/include/xgboost/feature_map.h
@@ -83,7 +83,7 @@ class FeatureMap {
     if (!strcmp("q", tname)) return kQuantitive;
     if (!strcmp("int", tname)) return kInteger;
     if (!strcmp("float", tname)) return kFloat;
-    if (!strcmp("categorical", tname)) return kCategorical;
+    if (!strcmp("c", tname)) return kCategorical;
     LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
     return kIndicator;
   }

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -518,8 +518,8 @@ def __init__(
         base_margin=None,
         missing: Optional[float] = None,
         silent=False,
-        feature_names=None,
-        feature_types=None,
+        feature_names: Optional[List[str]] = None,
+        feature_types: Optional[List[str]] = None,
         nthread: Optional[int] = None,
         group=None,
         qid=None,
@@ -558,8 +558,11 @@ def __init__(
             Whether print messages during construction
         feature_names : list, optional
             Set names for features.
-        feature_types : list, optional
-            Set types for features.
+        feature_types :
+
+            Set types for features.  When `enable_categorical` is set to `True`, string
+            "c" represents categorical data type.
+
         nthread : integer, optional
             Number of threads to use for loading data when parallelization is
             applicable. If -1, uses maximum threads available on the system.
@@ -577,11 +580,10 @@ def __init__(
 
             .. versionadded:: 1.3.0
 
-            Experimental support of specializing for categorical features.  Do
-            not set to True unless you are interested in development.
-            Currently it's only available for `gpu_hist` tree method with 1 vs
-            rest (one hot) categorical split.  Also, JSON serialization format,
-            `gpu_predictor` and pandas input are required.
+            Experimental support of specializing for categorical features.  Do not set to
+            True unless you are interested in development.  Currently it's only available
+            for `gpu_hist` tree method with 1 vs rest (one hot) categorical split.  Also,
+            JSON serialization format is required.
 
         """
         if group is not None and qid is not None:
@@ -673,8 +675,8 @@ def set_info(
         qid=None,
         label_lower_bound=None,
         label_upper_bound=None,
-        feature_names=None,
-        feature_types=None,
+        feature_names: Optional[List[str]] = None,
+        feature_types: Optional[List[str]] = None,
         feature_weights=None
     ) -> None:
         """Set meta info for DMatrix.  See doc string for :py:obj:`xgboost.DMatrix`."""
@@ -945,7 +947,7 @@ def slice(
         return res
 
     @property
-    def feature_names(self) -> List[str]:
+    def feature_names(self) -> Optional[List[str]]:
         """Get feature names (column labels).
 
         Returns
@@ -1033,17 +1035,21 @@ def feature_types(self) -> Optional[List[str]]:
         return res
 
     @feature_types.setter
-    def feature_types(self, feature_types: Optional[Union[List[Any], Any]]) -> None:
+    def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
         """Set feature types (column types).
 
-        This is for displaying the results and unrelated
-        to the learning process.
+        This is for displaying the results and categorical data support.  See doc string
+        of :py:obj:`xgboost.DMatrix` for details.
 
         Parameters
         ----------
         feature_types : list or None
             Labels for features. None will reset existing feature types
+
         """
+        # For compatibility reasons, this function wraps a single str input into a
+        # list.  We should not promote such usage, though: besides visualization, the
+        # field is also used for specifying the categorical data type.
         if feature_types is not None:
             if not isinstance(feature_types, (list, str)):
                 raise TypeError(
@@ -2461,8 +2467,13 @@ def _validate_features(self, data: DMatrix):
 
             raise ValueError(msg.format(self.feature_names, data.feature_names))
 
-    def get_split_value_histogram(self, feature, fmap='', bins=None,
-                                  as_pandas=True):
+    def get_split_value_histogram(
+        self,
+        feature: str,
+        fmap: Union[os.PathLike, str] = '',
+        bins: Optional[int] = None,
+        as_pandas: bool = True
+    ):
         """Get split value histogram of a feature
 
         Parameters
@@ -2510,7 +2521,7 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
             except (ValueError, AttributeError, TypeError):
                 # None.index: attr err, None[0]: type err, fn.index(-1): value err
                 feature_t = None
-            if feature_t == "categorical":
+            if feature_t == "c":  # categorical
                 raise ValueError(
                     "Split value historgam doesn't support categorical split."
                 )