Update document for multi output and categorical. (#7574)
* Group together categorical related parameters.
* Update documents about multioutput and categorical.
trivialfis committed Jan 18, 2022
1 parent dac9eb1 commit b4ec168
Showing 5 changed files with 27 additions and 22 deletions.
2 changes: 1 addition & 1 deletion demo/guide-python/custom_rmsle.py
@@ -7,7 +7,7 @@
Error (SLE) objective and RMSLE metric as customized functions, then compare it with
native implementation in XGBoost.
-See doc/tutorials/custom_metric_obj.rst for a step by step walkthrough, with other
+See :doc:`/tutorials/custom_metric_obj` for a step by step walkthrough, with other
details.
The `SLE` objective reduces impact of outliers in training dataset, hence here we also
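The demo's summary above names a custom Squared Log Error objective and RMSLE metric. As a minimal numpy-only sketch (not the demo file itself, which wires this up as a custom XGBoost metric), the RMSLE metric can be written as:

```python
import numpy as np

def rmsle(predt: np.ndarray, y: np.ndarray) -> float:
    """Root Mean Squared Log Error:
    sqrt(mean((log1p(pred) - log1p(label))^2)).

    Standalone sketch of the metric named in the demo's summary.
    """
    return float(np.sqrt(np.mean(np.square(np.log1p(predt) - np.log1p(y)))))

# log1p(e - 1) == 1 and log1p(0) == 0, so the error here is exactly 1.0.
print(rmsle(np.array([np.e - 1.0]), np.array([0.0])))  # -> 1.0
```

Because the comparison happens in log space, large outliers contribute far less than they would under plain RMSE, which is the property the demo's summary points out.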
2 changes: 2 additions & 0 deletions demo/guide-python/multioutput_regression.py
@@ -5,6 +5,8 @@
The demo is adopted from scikit-learn:
https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
+See :doc:`/tutorials/multioutput` for more information.
"""
import numpy as np
import xgboost as xgb
2 changes: 1 addition & 1 deletion doc/tutorials/categorical.rst
@@ -113,7 +113,7 @@ Miscellaneous
*************

By default, XGBoost assumes input categories are integers starting from 0 till the number
-of categories :math:`[0, n_categories)`. However, user might provide inputs with invalid
+of categories :math:`[0, n\_categories)`. However, user might provide inputs with invalid
values due to mistakes or missing values. It can be negative value, integer values that
can not be accurately represented by 32-bit floating point, or values that are larger than
actual number of unique categories. During training this is validated but for prediction
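The tutorial text above requires categories to be dense integers in :math:`[0, n\_categories)`. A minimal numpy sketch of re-encoding arbitrary raw values into that layout (an illustration, not part of the commit):

```python
import numpy as np

# Arbitrary, possibly sparse or negative raw category values.
raw = np.array([10, -1, 10, 7, 7, 300])

# np.unique with return_inverse re-encodes them as dense integer codes
# in [0, n_categories), the layout the tutorial above expects.
categories, codes = np.unique(raw, return_inverse=True)

print(codes.tolist())   # -> [2, 0, 2, 1, 1, 3]
print(len(categories))  # -> 4 distinct categories
```

This avoids the invalid inputs the tutorial warns about: negative values, values above the actual category count, or integers too large to round-trip through 32-bit floating point.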
17 changes: 9 additions & 8 deletions doc/tutorials/multioutput.rst
@@ -12,14 +12,15 @@ terminologies related to different multi-output models please refer to the `scik
user guide <https://scikit-learn.org/stable/modules/multiclass.HTML>`_.

Internally, XGBoost builds one model for each target similar to sklearn meta estimators,
-with the added benefit of reusing data and custom objective support. For a worked example
-of regression, see :ref:`sphx_glr_python_examples_multioutput_regression.py`. For
-multi-label classification, the binary relevance strategy is used. Input ``y`` should be
-of shape ``(n_samples, n_classes)`` with each column having a value of 0 or 1 to specify
-whether the sample is labeled as positive for respective class. Given a sample with 3
-output classes and 2 labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with
-the second class labeled as negative and the rest labeled as positive. At the moment
-XGBoost supports only dense matrix for labels.
+with the added benefit of reusing data and other integrated features like SHAP. For a
+worked example of regression, see
+:ref:`sphx_glr_python_examples_multioutput_regression.py`. For multi-label classification,
+the binary relevance strategy is used. Input ``y`` should be of shape ``(n_samples,
+n_classes)`` with each column having a value of 0 or 1 to specify whether the sample is
+labeled as positive for respective class. Given a sample with 3 output classes and 2
+labels, the corresponding `y` should be encoded as ``[1, 0, 1]`` with the second class
+labeled as negative and the rest labeled as positive. At the moment XGBoost supports only
+dense matrix for labels.

.. code-block:: python
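The tutorial paragraph above describes binary-relevance label encoding. A minimal numpy sketch of the dense ``(n_samples, n_classes)`` label matrix it calls for (illustrative only; the variable names are not from the commit):

```python
import numpy as np

# Binary-relevance labels: one column per class, 1 where the sample is
# positive for that class, 0 where it is negative. The first row is the
# sample from the tutorial text: 3 output classes, 2 positive labels,
# with the second class negative.
y = np.array([
    [1, 0, 1],  # classes 0 and 2 positive, class 1 negative
    [0, 1, 0],  # only class 1 positive
    [1, 1, 1],  # all three classes positive
])

print(y.shape)  # -> (3, 3): (n_samples, n_classes)
```

A dense matrix of this shape is what XGBoost accepts as labels; sparse label matrices are not supported, per the text above.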
26 changes: 14 additions & 12 deletions python-package/xgboost/sklearn.py
@@ -197,6 +197,18 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
Experimental support for categorical data. Do not set to true unless you are
interested in development. Only valid when `gpu_hist` and dataframe are used.
+max_cat_to_onehot : bool
+    .. versionadded:: 1.6.0
+    .. note:: This parameter is experimental
+    A threshold for deciding whether XGBoost should use one-hot encoding based split
+    for categorical data. When number of categories is lesser than the threshold then
+    one-hot encoding is chosen, otherwise the categories will be partitioned into
+    children nodes. Only relevant for regression and binary classification and
+    `approx` tree method.
eval_metric : Optional[Union[str, List[str], Callable]]
.. versionadded:: 1.6.0
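The ``max_cat_to_onehot`` docstring added above can be restated as a tiny predicate. This is an illustrative restatement of the documented rule only, not XGBoost's internal implementation:

```python
def use_onehot_split(n_categories: int, max_cat_to_onehot: int) -> bool:
    """Documented rule: one-hot encoding based splits are chosen when the
    number of categories is strictly less than the threshold; otherwise
    the categories are partitioned into children nodes."""
    return n_categories < max_cat_to_onehot

print(use_onehot_split(3, 4))   # few categories  -> one-hot split (True)
print(use_onehot_split(16, 4))  # many categories -> partition (False)
```

Note the docstring says "lesser than", so a category count equal to the threshold falls through to partition-based splits.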
@@ -267,16 +279,6 @@ def inner(y_score: np.ndarray, dmatrix: DMatrix) -> Tuple[str, float]:
callbacks = [xgb.callback.EarlyStopping(rounds=early_stopping_rounds,
save_best=True)]
-max_cat_to_onehot : bool
-    .. versionadded:: 1.6.0
-    A threshold for deciding whether XGBoost should use one-hot encoding based split
-    for categorical data. When number of categories is lesser than the threshold then
-    one-hot encoding is chosen, otherwise the categories will be partitioned into
-    children nodes. Only relevant for regression and binary classification and
-    `approx` tree method.
kwargs : dict, optional
Keyword arguments for XGBoost Booster object. Full documentation of parameters
can be found :doc:`here </parameter>`.
@@ -490,10 +492,10 @@ def __init__(
validate_parameters: Optional[bool] = None,
predictor: Optional[str] = None,
enable_categorical: bool = False,
+max_cat_to_onehot: Optional[int] = None,
eval_metric: Optional[Union[str, List[str], Callable]] = None,
early_stopping_rounds: Optional[int] = None,
callbacks: Optional[List[TrainingCallback]] = None,
-max_cat_to_onehot: Optional[int] = None,
**kwargs: Any
) -> None:
if not SKLEARN_INSTALLED:
@@ -530,10 +532,10 @@ def __init__(
self.validate_parameters = validate_parameters
self.predictor = predictor
self.enable_categorical = enable_categorical
+self.max_cat_to_onehot = max_cat_to_onehot
self.eval_metric = eval_metric
self.early_stopping_rounds = early_stopping_rounds
self.callbacks = callbacks
-self.max_cat_to_onehot = max_cat_to_onehot
if kwargs:
self.kwargs = kwargs
