diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py
index 551f4f535714..e502aab5ab35 100644
--- a/demo/guide-python/cat_in_the_dat.py
+++ b/demo/guide-python/cat_in_the_dat.py
@@ -7,6 +7,9 @@ And the data can be found at:
 https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
 
+Also, see the tutorial for using XGBoost with categorical data:
+https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
+
 .. versionadded 1.6.0
 
 """
 
@@ -48,8 +51,6 @@ def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
     for i in range(0, 6):
         X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")
 
-    print(X.shape)
-
     print(
         "train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1])
     )
@@ -64,7 +65,7 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, random_state=1994, test_size=0.2
    )
-
+    # Specify `enable_categorical`.
     clf = xgb.XGBClassifier(**params, enable_categorical=True)
     clf.fit(
         X_train,
@@ -72,7 +73,6 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
         eval_set=[(X_test, y_test), (X_train, y_train)],
         eval_metric="auc",
     )
-    print(clf.n_classes_)
     clf.save_model(os.path.join(output_dir, "categorical.json"))
 
     y_score = clf.predict_proba(X_test)[:, 1]  # proba of positive samples
@@ -82,12 +82,10 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
 
 def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
     """Train using one-hot encoded data."""
-
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, random_state=42, test_size=0.2
     )
-    print(X_train.shape, y_train.shape)
-
+    # Specify `enable_categorical`.
     clf = xgb.XGBClassifier(**params, enable_categorical=False)
     clf.fit(
         X_train,
diff --git a/demo/guide-python/categorical.py b/demo/guide-python/categorical.py
index 35d03e4f87b4..9476c1ed6232 100644
--- a/demo/guide-python/categorical.py
+++ b/demo/guide-python/categorical.py
@@ -5,6 +5,9 @@ which creates a sparse matrix and potentially increase memory usage. This demo
 showcases the experimental categorical data support, more advanced features are planned.
 
+Also, see the tutorial for using XGBoost with categorical data:
+https://xgboost.readthedocs.io/en/latest/tutorials/categorical.html
+
 .. versionadded:: 1.5.0
 
 """
 
diff --git a/doc/tutorials/categorical.rst b/doc/tutorials/categorical.rst
index 9598512e34d0..38ef9c02772f 100644
--- a/doc/tutorials/categorical.rst
+++ b/doc/tutorials/categorical.rst
@@ -58,10 +58,12 @@ can plot the model and calculate the global feature importance:
 The ``scikit-learn`` interface from dask is similar to single node version. The basic
 idea is create dataframe with category feature type, and tell XGBoost to use ``gpu_hist``
 with parameter ``enable_categorical``. See `this demo
-`_ for a
-worked example using categorical data with ``scikit-learn`` interface. For using it with
-the Kaggle tutorial dataset, see `this demo
-`_
+`__ for a
+worked example of using categorical data with ``scikit-learn`` interface. A comparison
+between using one-hot encoded data and XGBoost's categorical data support can be found
+`here
+`__.
+
 
 
 **********************
@@ -70,10 +72,10 @@ Using native interface
 
 The ``scikit-learn`` interface is user friendly, but lacks some features that are only
 available in native interface. For instance users cannot compute SHAP value directly or
-use quantized ``DMatrix``. Also native interface supports data types other than
-dataframe, like ``numpy/cupy array``. To use the native interface with categorical data,
-we need to pass the similar parameter to ``DMatrix`` and the ``train`` function. For
-dataframe input:
+use quantized :class:`DMatrix `. Also native interface supports data
+types other than dataframe, like ``numpy/cupy array``. To use the native interface with
+categorical data, we need to pass the similar parameter to :class:`DMatrix
+` and the :func:`train ` function. For dataframe input:
 
 .. code:: python
 
@@ -106,7 +108,7 @@ types by using the ``feature_types`` parameter in :class:`DMatrix
 ` can also be used as categorical data.
 
 **********
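
Reviewer note, not part of the patch: the tutorial text touched above describes passing ``enable_categorical`` to :class:`DMatrix` and :func:`train` in the native interface, and using ``feature_types`` for non-dataframe input. Below is a minimal sketch of that usage with a synthetic dataframe (column names and data are made up for illustration), assuming XGBoost 1.6+ where the ``hist`` tree method also handles categorical splits; earlier releases required ``gpu_hist`` as the tutorial notes.

.. code:: python

    import numpy as np
    import pandas as pd
    import xgboost as xgb

    rng = np.random.default_rng(1994)

    # Dataframe input: mark categorical columns with pandas' "category" dtype.
    X = pd.DataFrame(
        {
            "cat": pd.Series(rng.integers(0, 4, size=256)).astype("category"),
            "num": rng.normal(size=256),
        }
    )
    y = rng.integers(0, 2, size=256)

    # Pass `enable_categorical` to DMatrix, then train with the native interface.
    # Assumes XGBoost 1.6+; older releases needed tree_method="gpu_hist" here.
    Xy = xgb.DMatrix(X, label=y, enable_categorical=True)
    booster = xgb.train(
        {"tree_method": "hist", "objective": "binary:logistic"}, Xy, num_boost_round=10
    )

    # For numpy input, feature types are given explicitly instead:
    # "c" marks a categorical feature, "q" a numerical one.
    Xn = np.column_stack([rng.integers(0, 4, size=256), rng.normal(size=256)])
    Xyn = xgb.DMatrix(Xn, label=y, feature_types=["c", "q"], enable_categorical=True)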