From a5d222fcdb26d55cf60637840d0719efa2c74082 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan <jm.yuan@outlook.com>
Date: Fri, 2 Jul 2021 13:10:36 +0800
Subject: [PATCH] Handle categorical split in model histogram and dataframe.
 (#7065)

* Error on get_split_value_histogram when feature is categorical
* Add a category column to output dataframe
---
 python-package/xgboost/core.py          | 46 ++++++++++++++++++++----
 tests/python-gpu/test_gpu_parse_tree.py | 25 +++++++++++++
 tests/python-gpu/test_gpu_updaters.py   | 48 ++++++++++++++++---------
 3 files changed, 96 insertions(+), 23 deletions(-)
 create mode 100644 tests/python-gpu/test_gpu_parse_tree.py

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index f4fe1b3967fe..61356fc52e24 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -2225,7 +2225,7 @@ def get_score(
                 results[feat] = float(score)
         return results
 
-    def trees_to_dataframe(self, fmap=''):
+    def trees_to_dataframe(self, fmap=''):  # pylint: disable=too-many-statements
         """Parse a boosted tree model text dump into a pandas DataFrame structure.
 
         This feature is only defined when the decision tree model is chosen as base
@@ -2251,6 +2251,7 @@ def trees_to_dataframe(self, fmap=''):
         node_ids = []
         fids = []
         splits = []
+        categories = []
         y_directs = []
         n_directs = []
         missings = []
@@ -2275,6 +2276,7 @@ def trees_to_dataframe(self, fmap=''):
                     node_ids.append(int(re.findall(r'\b\d+\b', parse[0])[0]))
                     fids.append('Leaf')
                     splits.append(float('NAN'))
+                    categories.append(float('NAN'))
                     y_directs.append(float('NAN'))
                     n_directs.append(float('NAN'))
                     missings.append(float('NAN'))
@@ -2284,14 +2286,26 @@ def trees_to_dataframe(self, fmap=''):
                 else:
                     # parse string
                     fid = arr[1].split(']')
-                    parse = fid[0].split('<')
+                    if fid[0].find("<") != -1:
+                        # numerical
+                        parse = fid[0].split('<')
+                        splits.append(float(parse[1]))
+                        categories.append(None)
+                    elif fid[0].find(":{") != -1:
+                        # categorical
+                        parse = fid[0].split(":")
+                        cats = parse[1][1:-1]  # strip the {}
+                        cats = cats.split(",")
+                        splits.append(float("NAN"))
+                        categories.append(cats if cats else None)
+                    else:
+                        raise ValueError("Failed to parse model text dump.")
                     stats = re.split('=|,', fid[1])
 
                     # append to lists
                     tree_ids.append(i)
                     node_ids.append(int(re.findall(r'\b\d+\b', arr[0])[0]))
                     fids.append(parse[0])
-                    splits.append(float(parse[1]))
                     str_i = str(i)
                     y_directs.append(str_i + '-' + stats[1])
                     n_directs.append(str_i + '-' + stats[3])
@@ -2303,7 +2317,7 @@ def trees_to_dataframe(self, fmap=''):
         df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids,
                         'Feature': fids, 'Split': splits, 'Yes': y_directs,
                         'No': n_directs, 'Missing': missings, 'Gain': gains,
-                        'Cover': covers})
+                        'Cover': covers, "Category": categories})
 
         if callable(getattr(df, 'sort_values', None)):
             # pylint: disable=no-member
@@ -2381,9 +2395,29 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
         nph = np.column_stack((nph[1][1:], nph[0]))
         nph = nph[nph[:, 1] > 0]
 
+        if nph.size == 0:
+            ft = self.feature_types
+            fn = self.feature_names
+            if fn is None:
+                # Let xgboost generate the feature names.
+                fn = ["f{0}".format(i) for i in range(self.num_features())]
+            try:
+                index = fn.index(feature)
+                feature_t = ft[index]
+            except (ValueError, AttributeError, TypeError):
+                # None.index: attr err, None[0]: type err, fn.index(-1): value err
+                feature_t = None
+            if feature_t == "categorical":
+                raise ValueError(
+                    "Split value historgam doesn't support categorical split."
+                )
+
         if as_pandas and PANDAS_INSTALLED:
             return DataFrame(nph, columns=['SplitValue', 'Count'])
         if as_pandas and not PANDAS_INSTALLED:
-            sys.stderr.write(
-                "Returning histogram as ndarray (as_pandas == True, but pandas is not installed).")
+            warnings.warn(
+                "Returning histogram as ndarray"
+                " (as_pandas == True, but pandas is not installed).",
+                UserWarning
+            )
         return nph
diff --git a/tests/python-gpu/test_gpu_parse_tree.py b/tests/python-gpu/test_gpu_parse_tree.py
new file mode 100644
index 000000000000..8033fb9852d1
--- /dev/null
+++ b/tests/python-gpu/test_gpu_parse_tree.py
@@ -0,0 +1,25 @@
+import sys
+import pytest
+import xgboost as xgb
+
+sys.path.append("tests/python")
+import testing as tm
+
+
+def test_tree_to_df_categorical():
+    X, y = tm.make_categorical(100, 10, 31, False)
+    Xy = xgb.DMatrix(X, y, enable_categorical=True)
+    booster = xgb.train({"tree_method": "gpu_hist"}, Xy, num_boost_round=10)
+    df = booster.trees_to_dataframe()
+    for _, x in df.iterrows():
+        if x["Feature"] != "Leaf":
+            assert len(x["Category"]) == 1
+
+
+def test_split_value_histograms():
+    X, y = tm.make_categorical(1000, 10, 13, False)
+    reg = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)
+    reg.fit(X, y)
+
+    with pytest.raises(ValueError, match="doesn't"):
+        reg.get_booster().get_split_value_histogram("3", bins=5)
diff --git a/tests/python-gpu/test_gpu_updaters.py b/tests/python-gpu/test_gpu_updaters.py
index 3c3a7e045058..11140a7083dc 100644
--- a/tests/python-gpu/test_gpu_updaters.py
+++ b/tests/python-gpu/test_gpu_updaters.py
@@ -32,15 +32,14 @@ def train_result(param, dmat, num_rounds):
 
 
 class TestGPUUpdaters:
-    @given(parameter_strategy, strategies.integers(1, 20),
-           tm.dataset_strategy)
+    @given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
     @settings(deadline=None)
     def test_gpu_hist(self, param, num_rounds, dataset):
-        param['tree_method'] = 'gpu_hist'
+        param["tree_method"] = "gpu_hist"
         param = dataset.set_params(param)
         result = train_result(param, dataset.get_dmat(), num_rounds)
         note(result)
-        assert tm.non_increasing(result['train'][dataset.metric])
+        assert tm.non_increasing(result["train"][dataset.metric])
 
     def run_categorical_basic(self, rows, cols, rounds, cats):
         onehot, label = tm.make_categorical(rows, cols, cats, True)
@@ -49,25 +48,40 @@ def run_categorical_basic(self, rows, cols, rounds, cats):
         by_etl_results = {}
         by_builtin_results = {}
 
-        parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
+        parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}
 
-        m = xgb.DMatrix(onehot, label, enable_categorical=True)
-        xgb.train(parameters, m,
-                  num_boost_round=rounds,
-                  evals=[(m, 'Train')], evals_result=by_etl_results)
+        m = xgb.DMatrix(onehot, label, enable_categorical=False)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_etl_results,
+        )
 
         m = xgb.DMatrix(cat, label, enable_categorical=True)
-        xgb.train(parameters, m,
-                  num_boost_round=rounds,
-                  evals=[(m, 'Train')], evals_result=by_builtin_results)
+        xgb.train(
+            parameters,
+            m,
+            num_boost_round=rounds,
+            evals=[(m, "Train")],
+            evals_result=by_builtin_results,
+        )
+
+        # There are guidelines on how to specify tolerance based on considering output as
+        # random variables. But in here the tree construction is extremely sensitive to
+        # floating point errors. An 1e-5 error in a histogram bin can lead to an entirely
+        # different tree.  So even though the test is quite lenient, hypothesis can still
+        # pick up falsifying examples from time to time.
         np.testing.assert_allclose(
-            np.array(by_etl_results['Train']['rmse']),
-            np.array(by_builtin_results['Train']['rmse']),
-            rtol=1e-3)
-        assert tm.non_increasing(by_builtin_results['Train']['rmse'])
+            np.array(by_etl_results["Train"]["rmse"]),
+            np.array(by_builtin_results["Train"]["rmse"]),
+            rtol=1e-3,
+        )
+        assert tm.non_increasing(by_builtin_results["Train"]["rmse"])
 
     @given(strategies.integers(10, 400), strategies.integers(3, 8),
-           strategies.integers(1, 5), strategies.integers(4, 7))
+           strategies.integers(1, 2), strategies.integers(4, 7))
     @settings(deadline=None)
     @pytest.mark.skipif(**tm.no_pandas())
     def test_categorical(self, rows, cols, rounds, cats):