-
-
Notifications
You must be signed in to change notification settings - Fork 8.7k
/
cat_in_the_dat.py
123 lines (93 loc) · 3.64 KB
/
cat_in_the_dat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Train XGBoost with cat_in_the_dat dataset
=========================================
A simple demo for categorical data support using dataset from Kaggle categorical data
tutorial.
The excellent tutorial is at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
And the data can be found at:
https://www.kaggle.com/shahules/an-overview-of-encoding-techniques/data
Also, see the tutorial for using XGBoost with categorical data:
:doc:`/tutorials/categorical`.
.. versionadded 1.6.0
"""
from __future__ import annotations
from time import time
import os
from tempfile import TemporaryDirectory
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import xgboost as xgb
def load_cat_in_the_dat() -> tuple[pd.DataFrame, pd.Series]:
"""Assuming you have already downloaded the data into `input` directory."""
df_train = pd.read_csv("./input/cat-in-the-dat/train.csv")
print(
"train data set has got {} rows and {} columns".format(
df_train.shape[0], df_train.shape[1]
)
)
X = df_train.drop(["target"], axis=1)
y = df_train["target"]
for i in range(0, 5):
X["bin_" + str(i)] = X["bin_" + str(i)].astype("category")
for i in range(0, 5):
X["nom_" + str(i)] = X["nom_" + str(i)].astype("category")
for i in range(5, 10):
X["nom_" + str(i)] = X["nom_" + str(i)].apply(int, base=16)
for i in range(0, 6):
X["ord_" + str(i)] = X["ord_" + str(i)].astype("category")
print(
"train data set has got {} rows and {} columns".format(X.shape[0], X.shape[1])
)
return X, y
params = {
"tree_method": "gpu_hist",
"n_estimators": 32,
"colsample_bylevel": 0.7,
}
def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using builtin categorical data support from XGBoost"""
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=1994, test_size=0.2
)
# Specify `enable_categorical` to True.
clf = xgb.XGBClassifier(
**params,
eval_metric="auc",
enable_categorical=True,
max_cat_to_onehot=1, # We use optimal partitioning exclusively
)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test), (X_train, y_train)])
clf.save_model(os.path.join(output_dir, "categorical.json"))
y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
auc = roc_auc_score(y_test, y_score)
print("AUC of using builtin categorical data support:", auc)
def onehot_encoding_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
"""Train using one-hot encoded data."""
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=42, test_size=0.2
)
# Specify `enable_categorical` to False as we are using encoded data.
clf = xgb.XGBClassifier(**params, eval_metric="auc", enable_categorical=False)
clf.fit(
X_train,
y_train,
eval_set=[(X_test, y_test), (X_train, y_train)],
)
clf.save_model(os.path.join(output_dir, "one-hot.json"))
y_score = clf.predict_proba(X_test)[:, 1] # proba of positive samples
auc = roc_auc_score(y_test, y_score)
print("AUC of using onehot encoding:", auc)
if __name__ == "__main__":
X, y = load_cat_in_the_dat()
with TemporaryDirectory() as tmpdir:
start = time()
categorical_model(X, y, tmpdir)
end = time()
print("Duration:categorical", end - start)
X = pd.get_dummies(X)
start = time()
onehot_encoding_model(X, y, tmpdir)
end = time()
print("Duration:onehot", end - start)