
Commit

[doc] Small improvements for categorical data document. (#7330)
trivialfis committed Oct 20, 2021
1 parent f999897 commit 1568599
Showing 3 changed files with 19 additions and 16 deletions.
12 changes: 5 additions & 7 deletions demo/guide-python/cat_in_the_dat.py
@@ -7,6 +7,9 @@
And the data can be found at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
+Also, see the tutorial for using XGBoost with categorical data:
+https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
.. versionadded:: 1.6.0
"""
@@ -48,8 +51,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
for i in range(0, 6):
X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")

print(X.shape)

print(
"train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1])
)
@@ -64,15 +65,14 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=1994, test_size=0.2
)

# Specify `enable_categorical`.
clf = xgb.XGBClassifier(**params, enable_categorical=True)
clf.fit(
X_train,
y_train,
eval_set=[(X_test, y_test), (X_train, y_train)],
eval_metric="auc",
)
print(clf.n_classes_)
clf.save_model(os.path.join(output_dir, "categorical.json"))

y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
@@ -82,12 +82,10 @@ def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:

def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using one-hot encoded data."""

X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, test_size=0.2
)
print(X_train.shape, y_train.shape)

# Specify `enable_categorical`.
clf = xgb.XGBClassifier(**params, enable_categorical=False)
clf.fit(
X_train,
3 changes: 3 additions & 0 deletions demo/guide-python/categorical.py
@@ -5,6 +5,9 @@
which creates a sparse matrix and potentially increase memory usage. This demo showcases
the experimental categorical data support, more advanced features are planned.
+Also, see the tutorial for using XGBoost with categorical data:
+https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
.. versionadded:: 1.5.0
"""
20 changes: 11 additions & 9 deletions doc/tutorials/categorical.rst
@@ -58,10 +58,12 @@ can plot the model and calculate the global feature importance:
The ``scikit-learn`` interface from dask is similar to single node version. The basic
idea is create dataframe with category feature type, and tell XGBoost to use ``gpu_hist``
with parameter ``enable_categorical``. See `this demo
-<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/categorical.py>`_ for a
-worked example using categorical data with ``scikit-learn`` interface. For using it with
-the Kaggle tutorial dataset, see `this demo
-<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cat_in_the_dat.py>`_
+<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/categorical.py>`__ for a
+worked example of using categorical data with ``scikit-learn`` interface. A comparison
+between using one-hot encoded data and XGBoost's categorical data support can be found
+`here
+<https://github.com/dmlc/xgboost/blob/master/demo/guide-python/cat_in_the_dat.py>`__.
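Purely as an illustration (not part of this patch), the dask flow described above might look roughly like the sketch below; the toy data and the scheduler address are made up, and workers with CUDA devices are assumed since ``gpu_hist`` runs on the GPU:

.. code:: python

    import dask.dataframe as dd
    import pandas as pd
    from dask.distributed import Client
    from xgboost.dask import DaskXGBClassifier

    # Placeholder address; any cluster whose workers have GPUs (for instance
    # one started with dask-cuda) would do.
    client = Client("tcp://scheduler:8786")

    df = pd.DataFrame({"c0": ["a", "b", "a", "c"], "f0": [1.0, 2.0, 3.0, 4.0]})
    df["c0"] = df["c0"].astype("category")  # create the category feature type
    X = dd.from_pandas(df, npartitions=2)
    y = dd.from_pandas(pd.Series([0, 1, 0, 1]), npartitions=2)

    clf = DaskXGBClassifier(tree_method="gpu_hist", enable_categorical=True)
    clf.client = client
    clf.fit(X, y)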



**********************
@@ -70,10 +72,10 @@ Using native interface

The ``scikit-learn`` interface is user friendly, but lacks some features that are only
available in native interface. For instance users cannot compute SHAP value directly or
-use quantized ``DMatrix``. Also native interface supports data types other than
-dataframe, like ``numpy/cupy array``. To use the native interface with categorical data,
-we need to pass the similar parameter to ``DMatrix`` and the ``train`` function. For
-dataframe input:
+use quantized :class:`DMatrix <xgboost.DMatrix>`. Also native interface supports data
+types other than dataframe, like ``numpy/cupy array``. To use the native interface with
+categorical data, we need to pass the similar parameter to :class:`DMatrix
+<xgboost.DMatrix>` and the :func:`train <xgboost.train>` function. For dataframe input:

.. code:: python
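    # NOTE: the body of this code block is collapsed in this view.  The lines
    # below are an illustrative sketch of such usage, not necessarily the
    # file's exact contents; the dataframe is made up and ``gpu_hist`` assumes
    # a CUDA device.
    import pandas as pd
    import xgboost as xgb

    X = pd.DataFrame({"c0": ["a", "b", "a", "c"], "f0": [1.0, 2.0, 3.0, 4.0]})
    X["c0"] = X["c0"].astype("category")  # categorical column in the dataframe
    y = [0, 1, 0, 1]

    # `enable_categorical` is passed to DMatrix rather than to the scikit-learn
    # estimator; `tree_method` goes into the training parameters.
    Xy = xgb.DMatrix(X, label=y, enable_categorical=True)
    booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)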
@@ -106,7 +108,7 @@ types by using the ``feature_types`` parameter in :class:`DMatrix <xgboost.DMatrix>`
For numerical data, the feature type can be ``"q"`` or ``"float"``, while for categorical
feature it's specified as ``"c"``. The Dask module in XGBoost has the same interface so
-``dask.Array`` can also be used as categorical data.
+:class:`dask.Array <dask.Array>` can also be used as categorical data.
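As a sketch of the non-dataframe route described above (again not part of this patch), the array below is made up, with the second column holding category codes; ``gpu_hist`` assumes a CUDA device:

.. code:: python

    import numpy as np
    import xgboost as xgb

    # First column numerical, second column categorical encoded as codes 0/1/2.
    X = np.array([[1.0, 0.0], [2.0, 1.0], [3.0, 2.0], [4.0, 0.0]])
    y = np.array([0, 1, 0, 1])

    Xy = xgb.DMatrix(
        X,
        label=y,
        feature_types=["q", "c"],  # "q"/"float" for numerical, "c" for categorical
        enable_categorical=True,
    )
    booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)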


**********
