Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle categorical split in model histogram and dataframe. #7065

Merged
merged 10 commits into from Jul 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 40 additions & 6 deletions python-package/xgboost/core.py
Expand Up @@ -2225,7 +2225,7 @@ def get_score(
results[feat] = float(score)
return results

def trees_to_dataframe(self, fmap=''):
def trees_to_dataframe(self, fmap=''): # pylint: disable=too-many-statements
"""Parse a boosted tree model text dump into a pandas DataFrame structure.

This feature is only defined when the decision tree model is chosen as base
Expand All @@ -2251,6 +2251,7 @@ def trees_to_dataframe(self, fmap=''):
node_ids = []
fids = []
splits = []
categories = []
y_directs = []
n_directs = []
missings = []
Expand All @@ -2275,6 +2276,7 @@ def trees_to_dataframe(self, fmap=''):
node_ids.append(int(re.findall(r'\b\d+\b', parse[0])[0]))
fids.append('Leaf')
splits.append(float('NAN'))
categories.append(float('NAN'))
y_directs.append(float('NAN'))
n_directs.append(float('NAN'))
missings.append(float('NAN'))
Expand All @@ -2284,14 +2286,26 @@ def trees_to_dataframe(self, fmap=''):
else:
# parse string
fid = arr[1].split(']')
parse = fid[0].split('<')
if fid[0].find("<") != -1:
# numerical
parse = fid[0].split('<')
splits.append(float(parse[1]))
categories.append(None)
elif fid[0].find(":{") != -1:
# categorical
parse = fid[0].split(":")
cats = parse[1][1:-1] # strip the {}
cats = cats.split(",")
splits.append(float("NAN"))
categories.append(cats if cats else None)
else:
raise ValueError("Failed to parse model text dump.")
stats = re.split('=|,', fid[1])

# append to lists
tree_ids.append(i)
node_ids.append(int(re.findall(r'\b\d+\b', arr[0])[0]))
fids.append(parse[0])
splits.append(float(parse[1]))
str_i = str(i)
y_directs.append(str_i + '-' + stats[1])
n_directs.append(str_i + '-' + stats[3])
Expand All @@ -2303,7 +2317,7 @@ def trees_to_dataframe(self, fmap=''):
df = DataFrame({'Tree': tree_ids, 'Node': node_ids, 'ID': ids,
'Feature': fids, 'Split': splits, 'Yes': y_directs,
'No': n_directs, 'Missing': missings, 'Gain': gains,
'Cover': covers})
'Cover': covers, "Category": categories})

if callable(getattr(df, 'sort_values', None)):
# pylint: disable=no-member
Expand Down Expand Up @@ -2381,9 +2395,29 @@ def get_split_value_histogram(self, feature, fmap='', bins=None,
nph = np.column_stack((nph[1][1:], nph[0]))
nph = nph[nph[:, 1] > 0]

if nph.size == 0:
ft = self.feature_types
fn = self.feature_names
if fn is None:
# Let xgboost generate the feature names.
fn = ["f{0}".format(i) for i in range(self.num_features())]
try:
index = fn.index(feature)
feature_t = ft[index]
except (ValueError, AttributeError, TypeError):
# None.index: attr err, None[0]: type err, fn.index(-1): value err
feature_t = None
if feature_t == "categorical":
raise ValueError(
"Split value historgam doesn't support categorical split."
)

if as_pandas and PANDAS_INSTALLED:
return DataFrame(nph, columns=['SplitValue', 'Count'])
if as_pandas and not PANDAS_INSTALLED:
sys.stderr.write(
"Returning histogram as ndarray (as_pandas == True, but pandas is not installed).")
warnings.warn(
"Returning histogram as ndarray"
" (as_pandas == True, but pandas is not installed).",
UserWarning
)
return nph
25 changes: 25 additions & 0 deletions tests/python-gpu/test_gpu_parse_tree.py
@@ -0,0 +1,25 @@
import sys
import pytest
import xgboost as xgb

sys.path.append("tests/python")
import testing as tm


def test_tree_to_df_categorical():
    """Check the ``Category`` column produced by ``trees_to_dataframe``.

    Trains a ``gpu_hist`` booster on data from ``tm.make_categorical`` and
    asserts that every non-leaf row of the resulting dataframe carries
    exactly one entry in its ``Category`` field.
    """
    features, labels = tm.make_categorical(100, 10, 31, False)
    dtrain = xgb.DMatrix(features, labels, enable_categorical=True)
    params = {"tree_method": "gpu_hist"}
    model = xgb.train(params, dtrain, num_boost_round=10)
    tree_table = model.trees_to_dataframe()
    for _, row in tree_table.iterrows():
        # Leaf rows are not checked; only split nodes are inspected.
        if row["Feature"] == "Leaf":
            continue
        assert len(row["Category"]) == 1


def test_split_value_histograms():
    """Check that the split value histogram raises for this categorical model.

    Fits a ``gpu_hist`` regressor with ``enable_categorical=True`` on data
    from ``tm.make_categorical`` and expects ``get_split_value_histogram``
    to raise a ``ValueError`` whose message contains "doesn't".
    """
    features, labels = tm.make_categorical(1000, 10, 13, False)
    regressor = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)
    regressor.fit(features, labels)

    expected_failure = pytest.raises(ValueError, match="doesn't")
    with expected_failure:
        regressor.get_booster().get_split_value_histogram("3", bins=5)
48 changes: 31 additions & 17 deletions tests/python-gpu/test_gpu_updaters.py
Expand Up @@ -32,15 +32,14 @@ def train_result(param, dmat, num_rounds):


class TestGPUUpdaters:
@given(parameter_strategy, strategies.integers(1, 20),
tm.dataset_strategy)
@given(parameter_strategy, strategies.integers(1, 20), tm.dataset_strategy)
@settings(deadline=None)
def test_gpu_hist(self, param, num_rounds, dataset):
param['tree_method'] = 'gpu_hist'
param["tree_method"] = "gpu_hist"
param = dataset.set_params(param)
result = train_result(param, dataset.get_dmat(), num_rounds)
note(result)
assert tm.non_increasing(result['train'][dataset.metric])
assert tm.non_increasing(result["train"][dataset.metric])

def run_categorical_basic(self, rows, cols, rounds, cats):
onehot, label = tm.make_categorical(rows, cols, cats, True)
Expand All @@ -49,25 +48,40 @@ def run_categorical_basic(self, rows, cols, rounds, cats):
by_etl_results = {}
by_builtin_results = {}

parameters = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
parameters = {"tree_method": "gpu_hist", "predictor": "gpu_predictor"}

m = xgb.DMatrix(onehot, label, enable_categorical=True)
xgb.train(parameters, m,
num_boost_round=rounds,
evals=[(m, 'Train')], evals_result=by_etl_results)
m = xgb.DMatrix(onehot, label, enable_categorical=False)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_etl_results,
)

m = xgb.DMatrix(cat, label, enable_categorical=True)
xgb.train(parameters, m,
num_boost_round=rounds,
evals=[(m, 'Train')], evals_result=by_builtin_results)
xgb.train(
parameters,
m,
num_boost_round=rounds,
evals=[(m, "Train")],
evals_result=by_builtin_results,
)

# There are guidelines on how to specify a tolerance by treating the output as a
# random variable. Here, however, tree construction is extremely sensitive to
# floating point errors: a 1e-5 error in a histogram bin can lead to an entirely
# different tree. So even though the test is quite lenient, hypothesis can still
# pick up falsifying examples from time to time.
np.testing.assert_allclose(
np.array(by_etl_results['Train']['rmse']),
np.array(by_builtin_results['Train']['rmse']),
rtol=1e-3)
assert tm.non_increasing(by_builtin_results['Train']['rmse'])
np.array(by_etl_results["Train"]["rmse"]),
np.array(by_builtin_results["Train"]["rmse"]),
rtol=1e-3,
)
assert tm.non_increasing(by_builtin_results["Train"]["rmse"])

@given(strategies.integers(10, 400), strategies.integers(3, 8),
strategies.integers(1, 5), strategies.integers(4, 7))
strategies.integers(1, 2), strategies.integers(4, 7))
@settings(deadline=None)
@pytest.mark.skipif(**tm.no_pandas())
def test_categorical(self, rows, cols, rounds, cats):
Expand Down