# training.py -- forked from dmlc/xgboost
# coding: utf-8
# pylint: disable=too-many-locals, too-many-arguments, invalid-name
# pylint: disable=too-many-branches, too-many-statements
"""Training Library containing training routines."""
import warnings
import copy
import numpy as np
from .core import Booster, XGBoostError, _get_booster_layer_trees
from .compat import (SKLEARN_INSTALLED, XGBStratifiedKFold)
from . import callback
def _configure_deprecated_callbacks(
        verbose_eval, early_stopping_rounds, maximize, start_iteration,
        num_boost_round, feval, evals_result, callbacks, show_stdv, cvfolds):
    link = 'https://xgboost.readthedocs.io/en/latest/python/callbacks.html'
    warnings.warn(f'Old style callback is deprecated. See: {link}', UserWarning)
    # Most of the legacy advanced options become callbacks.
    if early_stopping_rounds is not None:
        callbacks.append(callback.early_stop(early_stopping_rounds,
                                             maximize=maximize,
                                             verbose=bool(verbose_eval)))
    if isinstance(verbose_eval, bool) and verbose_eval:
        callbacks.append(callback.print_evaluation(show_stdv=show_stdv))
    else:
        if isinstance(verbose_eval, int):
            callbacks.append(callback.print_evaluation(verbose_eval,
                                                       show_stdv=show_stdv))
    if evals_result is not None:
        callbacks.append(callback.record_evaluation(evals_result))
    callbacks = callback.LegacyCallbacks(
        callbacks, start_iteration, num_boost_round, feval, cvfolds=cvfolds)
    return callbacks
def _is_new_callback(callbacks):
    return any(isinstance(c, callback.TrainingCallback)
               for c in callbacks) or not callbacks
def _train_internal(params, dtrain,
                    num_boost_round=10, evals=(),
                    obj=None, feval=None,
                    xgb_model=None, callbacks=None,
                    evals_result=None, maximize=None,
                    verbose_eval=None, early_stopping_rounds=None):
    """Internal training function."""
    callbacks = [] if callbacks is None else copy.copy(callbacks)
    evals = list(evals)
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    if xgb_model is not None:
        bst = Booster(params, [dtrain] + [d[0] for d in evals],
                      model_file=xgb_model)
    start_iteration = 0

    is_new_callback = _is_new_callback(callbacks)
    if is_new_callback:
        assert all(isinstance(c, callback.TrainingCallback)
                   for c in callbacks), "You can't mix new and old callback styles."
        if verbose_eval:
            verbose_eval = 1 if verbose_eval is True else verbose_eval
            callbacks.append(callback.EvaluationMonitor(period=verbose_eval))
        if early_stopping_rounds:
            callbacks.append(callback.EarlyStopping(
                rounds=early_stopping_rounds, maximize=maximize))
        callbacks = callback.CallbackContainer(callbacks, metric=feval)
    else:
        callbacks = _configure_deprecated_callbacks(
            verbose_eval, early_stopping_rounds, maximize, start_iteration,
            num_boost_round, feval, evals_result, callbacks,
            show_stdv=False, cvfolds=None)

    bst = callbacks.before_training(bst)
    for i in range(start_iteration, num_boost_round):
        if callbacks.before_iteration(bst, i, dtrain, evals):
            break
        bst.update(dtrain, i, obj)
        if callbacks.after_iteration(bst, i, dtrain, evals):
            break
    bst = callbacks.after_training(bst)

    if evals_result is not None and is_new_callback:
        evals_result.update(callbacks.history)

    # These should be moved into the callback method `after_training`, but until
    # the old callbacks are removed, the train function is the only place for
    # setting the attributes.
    num_parallel_tree, _ = _get_booster_layer_trees(bst)
    if bst.attr('best_score') is not None:
        bst.best_score = float(bst.attr('best_score'))
        bst.best_iteration = int(bst.attr('best_iteration'))
        # num_class is handled internally
        bst.set_attr(
            best_ntree_limit=str((bst.best_iteration + 1) * num_parallel_tree)
        )
        bst.best_ntree_limit = int(bst.attr("best_ntree_limit"))
    else:
        # For compatibility with versions older than 1.4, these attributes are
        # added to the Python object even if early stopping is not used.
        bst.best_iteration = bst.num_boosted_rounds() - 1
        bst.set_attr(best_iteration=str(bst.best_iteration))
        bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
        bst.set_attr(best_ntree_limit=str(bst.best_ntree_limit))

    # Copy to serialise and unserialise the booster, resetting state and freeing
    # training memory.
    return bst.copy()
def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
          maximize=None, early_stopping_rounds=None, evals_result=None,
          verbose_eval=True, xgb_model=None, callbacks=None):
    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    """Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round: int
        Number of boosting iterations.
    evals: list of pairs (DMatrix, string)
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics help us track the performance of the model.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. The validation metric needs to improve at least
        once in every **early_stopping_rounds** round(s) to continue training.
        Requires at least one item in **evals**.
        The method returns the model from the last iteration (not the best one). Use
        a custom callback or model slicing if the best model is desired.
        If there's more than one item in **evals**, the last entry will be used for
        early stopping.
        If there's more than one metric in the **eval_metric** parameter given in
        **params**, the last metric will be used for early stopping.
        If early stopping occurs, the model will have three additional fields:
        ``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
    evals_result: dict
        This dictionary stores the evaluation results of all the items in the
        watchlist. Example: with a watchlist containing
        ``[(dtest, 'eval'), (dtrain, 'train')]`` and
        a parameter containing ``{'eval_metric': 'logloss'}``,
        the **evals_result** returns

        .. code-block:: python

            {'train': {'logloss': ['0.48253', '0.35953']},
             'eval': {'logloss': ['0.480385', '0.357756']}}

    verbose_eval : bool or int
        Requires at least one item in **evals**.
        If **verbose_eval** is True then the evaluation metric on the validation set
        is printed at each boosting stage.
        If **verbose_eval** is an integer then the evaluation metric on the validation
        set is printed at every **verbose_eval** boosting stages. The last boosting
        stage / the boosting stage found by using **early_stopping_rounds** is also
        printed.
        Example: with ``verbose_eval=4`` and at least one item in **evals**, an
        evaluation metric is printed every 4 boosting stages, instead of every
        boosting stage.
    xgb_model : file name of stored xgb model or 'Booster' instance
        Xgb model to be loaded before training (allows training continuation).
    callbacks : list of callback functions
        List of callback functions that are applied at the end of each iteration.
        It is possible to use predefined callbacks by using
        :ref:`Callback API <callback_api>`.
        Example:

        .. code-block:: python

            [xgb.callback.LearningRateScheduler(custom_rates)]

    Returns
    -------
    Booster : a trained booster model
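
    Example
    -------
    A minimal usage sketch (``X_train``, ``y_train``, ``X_valid`` and ``y_valid``
    are assumed to be pre-existing NumPy arrays; the parameter values are
    illustrative, not recommendations):

    .. code-block:: python

        import xgboost as xgb

        dtrain = xgb.DMatrix(X_train, label=y_train)
        dvalid = xgb.DMatrix(X_valid, label=y_valid)
        params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
        evals_result = {}
        bst = xgb.train(params, dtrain,
                        num_boost_round=100,
                        evals=[(dtrain, 'train'), (dvalid, 'valid')],
                        early_stopping_rounds=10,
                        evals_result=evals_result,
                        verbose_eval=10)
        # The returned model contains all boosted rounds; model slicing (supported
        # by recent XGBoost versions) keeps only the rounds up to the best iteration.
        best = bst[: bst.best_iteration + 1]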
"""
bst = _train_internal(params, dtrain,
num_boost_round=num_boost_round,
evals=evals,
obj=obj, feval=feval,
xgb_model=xgb_model, callbacks=callbacks,
verbose_eval=verbose_eval,
evals_result=evals_result,
maximize=maximize,
early_stopping_rounds=early_stopping_rounds)
return bst
class CVPack(object):
    """Auxiliary datastruct to hold one fold of CV."""

    def __init__(self, dtrain, dtest, param):
        """Initialize the CVPack."""
        self.dtrain = dtrain
        self.dtest = dtest
        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
        self.bst = Booster(param, [dtrain, dtest])

    def __getattr__(self, name):
        def _inner(*args, **kwargs):
            return getattr(self.bst, name)(*args, **kwargs)
        return _inner

    def update(self, iteration, fobj):
        """Update the boosters for one iteration."""
        self.bst.update(self.dtrain, iteration, fobj)

    def eval(self, iteration, feval):
        """Evaluate the CVPack for one iteration."""
        return self.bst.eval_set(self.watchlist, iteration, feval)
class _PackedBooster:
    def __init__(self, cvfolds) -> None:
        self.cvfolds = cvfolds

    def update(self, iteration, obj):
        '''Iterate through folds for update'''
        for fold in self.cvfolds:
            fold.update(iteration, obj)

    def eval(self, iteration, feval):
        '''Iterate through folds for eval'''
        result = [f.eval(iteration, feval) for f in self.cvfolds]
        return result

    def set_attr(self, **kwargs):
        '''Iterate through folds for setting attributes'''
        for f in self.cvfolds:
            f.bst.set_attr(**kwargs)

    def attr(self, key):
        '''Redirect to booster attr.'''
        return self.cvfolds[0].bst.attr(key)

    def set_param(self, params, value=None):
        """Iterate through folds for set_param"""
        for f in self.cvfolds:
            f.bst.set_param(params, value)

    def num_boosted_rounds(self):
        '''Number of boosted rounds.'''
        return self.cvfolds[0].num_boosted_rounds()

    @property
    def best_iteration(self):
        '''Get best_iteration'''
        return int(self.cvfolds[0].bst.attr("best_iteration"))

    @property
    def best_score(self):
        """Get best_score."""
        return float(self.cvfolds[0].bst.attr("best_score"))
def groups_to_rows(groups, boundaries):
    """
    Given group row boundaries, convert group indexes to row indexes.

    :param groups: list of groups for testing
    :param boundaries: row index limits of each group
    :return: row indexes for the given groups
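
    Illustrative example: with ``boundaries = [0, 3, 5]`` (two groups of sizes
    3 and 2), selecting group ``1`` yields the rows of the second group::

        >>> groups_to_rows([1], np.array([0, 3, 5]))
        array([3, 4])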
"""
return np.concatenate([np.arange(boundaries[g], boundaries[g+1]) for g in groups])
def mkgroupfold(dall, nfold, param, evals=(), fpreproc=None, shuffle=True):
    """
    Make n folds for cross-validation maintaining groups
    :return: cross-validation folds
    """
    # we have groups for pairwise ranking... get a list of the group indexes
    group_boundaries = dall.get_uint_info('group_ptr')
    group_sizes = np.diff(group_boundaries)

    if shuffle is True:
        idx = np.random.permutation(len(group_sizes))
    else:
        idx = np.arange(len(group_sizes))
    # list by fold of test group indexes
    out_group_idset = np.array_split(idx, nfold)
    # list by fold of train group indexes
    in_group_idset = [np.concatenate([out_group_idset[i] for i in range(nfold) if k != i])
                      for k in range(nfold)]

    # from the group indexes, convert them to row indexes
    in_idset = [groups_to_rows(in_groups, group_boundaries) for in_groups in in_group_idset]
    out_idset = [groups_to_rows(out_groups, group_boundaries) for out_groups in out_group_idset]

    # build the folds by taking the appropriate slices
    ret = []
    for k in range(nfold):
        # perform the slicing using the indexes determined by the above methods
        dtrain = dall.slice(in_idset[k], allow_groups=True)
        dtrain.set_group(group_sizes[in_group_idset[k]])
        dtest = dall.slice(out_idset[k], allow_groups=True)
        dtest.set_group(group_sizes[out_group_idset[k]])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret
def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None, stratified=False,
            folds=None, shuffle=True):
    """
    Make an n-fold list of CVPack from random indices.
    """
    evals = list(evals)
    np.random.seed(seed)

    if stratified is False and folds is None:
        # Do standard k-fold cross validation. Automatically determine the folds.
        if len(dall.get_uint_info('group_ptr')) > 1:
            return mkgroupfold(dall, nfold, param, evals=evals, fpreproc=fpreproc, shuffle=shuffle)

        if shuffle is True:
            idx = np.random.permutation(dall.num_row())
        else:
            idx = np.arange(dall.num_row())
        out_idset = np.array_split(idx, nfold)
        in_idset = [np.concatenate([out_idset[i] for i in range(nfold) if k != i])
                    for k in range(nfold)]
    elif folds is not None:
        # Use user specified custom split using indices
        try:
            in_idset = [x[0] for x in folds]
            out_idset = [x[1] for x in folds]
        except TypeError:
            # Custom stratification using Sklearn KFoldSplit object
            splits = list(folds.split(X=dall.get_label(), y=dall.get_label()))
            in_idset = [x[0] for x in splits]
            out_idset = [x[1] for x in splits]
        nfold = len(out_idset)
    else:
        # Do standard stratified shuffle k-fold split
        sfk = XGBStratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)
        splits = list(sfk.split(X=dall.get_label(), y=dall.get_label()))
        in_idset = [x[0] for x in splits]
        out_idset = [x[1] for x in splits]
        nfold = len(out_idset)

    ret = []
    for k in range(nfold):
        # perform the slicing using the indexes determined by the above methods
        dtrain = dall.slice(in_idset[k])
        dtest = dall.slice(out_idset[k])
        # run preprocessing on the data set if needed
        if fpreproc is not None:
            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
        else:
            tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret
def cv(params, dtrain, num_boost_round=10, nfold=3, stratified=False, folds=None,
       metrics=(), obj=None, feval=None, maximize=None, early_stopping_rounds=None,
       fpreproc=None, as_pandas=True, verbose_eval=None, show_stdv=True,
       seed=0, callbacks=None, shuffle=True):
    # pylint: disable = invalid-name
    """Cross-validation with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    nfold : int
        Number of folds in CV.
    stratified : bool
        Perform stratified sampling.
    folds : a KFold or StratifiedKFold instance or list of fold indices
        Sklearn KFolds or StratifiedKFolds object.
        Alternatively may explicitly pass sample indices for each fold.
        For ``n`` folds, **folds** should be a length ``n`` list of tuples.
        Each tuple is ``(in, out)`` where ``in`` is a list of indices to be used
        as the training samples for the ``n``-th fold and ``out`` is a list of
        indices to be used as the testing samples for the ``n``-th fold.
    metrics : string or list of strings
        Evaluation metrics to be watched in CV.
    obj : function
        Custom objective function.
    feval : function
        Custom evaluation function.
    maximize : bool
        Whether to maximize feval.
    early_stopping_rounds: int
        Activates early stopping. The cross-validation metric (average of the
        validation metric computed over CV folds) needs to improve at least once in
        every **early_stopping_rounds** round(s) to continue training.
        The last entry in the evaluation history will represent the best iteration.
        If there's more than one metric in the **eval_metric** parameter given in
        **params**, the last metric will be used for early stopping.
    fpreproc : function
        Preprocessing function that takes (dtrain, dtest, param) and returns
        transformed versions of those.
    as_pandas : bool, default True
        Return pd.DataFrame when pandas is installed.
        If False or pandas is not installed, return np.ndarray.
    verbose_eval : bool, int, or None, default None
        Whether to display the progress. If None, progress will be displayed
        when np.ndarray is returned. If True, progress will be displayed at
        every boosting stage. If an integer is given, progress will be displayed
        at every given `verbose_eval` boosting stage.
    show_stdv : bool, default True
        Whether to display the standard deviation in progress.
        Results are not affected, and always contain std.
    seed : int
        Seed used to generate the folds (passed to numpy.random.seed).
    callbacks : list of callback functions
        List of callback functions that are applied at the end of each iteration.
        It is possible to use predefined callbacks by using
        :ref:`Callback API <callback_api>`.
        Example:

        .. code-block:: python

            [xgb.callback.LearningRateScheduler(custom_rates)]

    shuffle : bool
        Shuffle data before creating folds.

    Returns
    -------
    evaluation history : list(string)
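
    Example
    -------
    A minimal usage sketch (``dtrain`` is assumed to be an existing ``DMatrix``;
    the parameter values are illustrative, and ``fpreproc`` below is a hypothetical
    per-fold preprocessing hook following the contract described above):

    .. code-block:: python

        import numpy as np
        import xgboost as xgb

        def fpreproc(dtrain, dtest, param):
            # Re-derive a parameter from the training fold, e.g. scale_pos_weight.
            labels = dtrain.get_label()
            ratio = float(np.sum(labels == 0)) / np.sum(labels == 1)
            param['scale_pos_weight'] = ratio
            return dtrain, dtest, param

        params = {'objective': 'binary:logistic'}
        history = xgb.cv(params, dtrain,
                         num_boost_round=50,
                         nfold=5,
                         metrics=['logloss'],
                         early_stopping_rounds=10,
                         fpreproc=fpreproc,
                         seed=0)
        # With pandas installed, ``history`` is a DataFrame whose columns include
        # 'test-logloss-mean' and 'test-logloss-std'.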
"""
if stratified is True and not SKLEARN_INSTALLED:
raise XGBoostError('sklearn needs to be installed in order to use stratified cv')
if isinstance(metrics, str):
metrics = [metrics]
if isinstance(params, list):
_metrics = [x[1] for x in params if x[0] == 'eval_metric']
params = dict(params)
if 'eval_metric' in params:
params['eval_metric'] = _metrics
else:
params = dict((k, v) for k, v in params.items())
if (not metrics) and 'eval_metric' in params:
if isinstance(params['eval_metric'], list):
metrics = params['eval_metric']
else:
metrics = [params['eval_metric']]
params.pop("eval_metric", None)
results = {}
cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc,
stratified, folds, shuffle)
# setup callbacks
callbacks = [] if callbacks is None else callbacks
is_new_callback = _is_new_callback(callbacks)
if is_new_callback:
assert all(isinstance(c, callback.TrainingCallback)
for c in callbacks), "You can't mix new and old callback styles."
if verbose_eval:
verbose_eval = 1 if verbose_eval is True else verbose_eval
callbacks.append(
callback.EvaluationMonitor(period=verbose_eval, show_stdv=show_stdv)
)
if early_stopping_rounds:
callbacks.append(
callback.EarlyStopping(rounds=early_stopping_rounds, maximize=maximize)
)
callbacks = callback.CallbackContainer(callbacks, metric=feval, is_cv=True)
else:
callbacks = _configure_deprecated_callbacks(
verbose_eval, early_stopping_rounds, maximize, 0,
num_boost_round, feval, None, callbacks,
show_stdv=show_stdv, cvfolds=cvfolds)
booster = _PackedBooster(cvfolds)
callbacks.before_training(booster)
for i in range(num_boost_round):
if callbacks.before_iteration(booster, i, dtrain, None):
break
booster.update(i, obj)
should_break = callbacks.after_iteration(booster, i, dtrain, None)
res = callbacks.aggregated_cv
for key, mean, std in res:
if key + '-mean' not in results:
results[key + '-mean'] = []
if key + '-std' not in results:
results[key + '-std'] = []
results[key + '-mean'].append(mean)
results[key + '-std'].append(std)
if should_break:
for k in results.keys(): # pylint: disable=consider-iterating-dictionary
results[k] = results[k][:(booster.best_iteration + 1)]
break
if as_pandas:
try:
import pandas as pd
results = pd.DataFrame.from_dict(results)
except ImportError:
pass
callbacks.after_training(booster)
return results