forked from dmlc/xgboost
/
xgb.DMatrix.R
376 lines (355 loc) · 12.2 KB
/
xgb.DMatrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
#' Construct xgb.DMatrix object
#'
#' Construct xgb.DMatrix object from either a dense matrix, a sparse matrix, or a local file.
#' Supported input file formats are either a LIBSVM text file or a binary file that was created previously by
#' \code{\link{xgb.DMatrix.save}}).
#'
#' @param data a \code{matrix} object (either numeric or integer), a \code{dgCMatrix} object, or a character
#' string representing a filename.
#' @param info a named list of additional information to store in the \code{xgb.DMatrix} object.
#' See \code{\link{setinfo}} for the specific allowed kinds of
#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param nthread Number of threads used for creating DMatrix.
#' @param ... the \code{info} data could be passed directly as parameters, without creating an \code{info} list.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
#' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data')
#' dtrain <- xgb.DMatrix('xgb.DMatrix.data')
#' if (file.exists('xgb.DMatrix.data')) file.remove('xgb.DMatrix.data')
#' @export
xgb.DMatrix <- function(data, info = list(), missing = NA, silent = FALSE, nthread = NULL, ...) {
cnames <- NULL
if (typeof(data) == "character") {
if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.")
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
cnames <- colnames(data)
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(XGDMatrixCreateFromCSC_R, data@p, data@i, data@x, nrow(data))
cnames <- colnames(data)
} else {
stop("xgb.DMatrix does not support construction from ", typeof(data))
}
dmat <- handle
attributes(dmat) <- list(.Dimnames = list(NULL, cnames), class = "xgb.DMatrix")
info <- append(info, list(...))
for (i in seq_along(info)) {
p <- info[i]
setinfo(dmat, names(p), p[[1]])
}
return(dmat)
}
# get dmatrix from data, label
# internal helper method
xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL, nthread = NULL) {
if (inherits(data, "dgCMatrix") || is.matrix(data)) {
if (is.null(label)) {
stop("label must be provided when data is a matrix")
}
dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread)
if (!is.null(weight)){
setinfo(dtrain, "weight", weight)
}
} else {
if (!is.null(label)) {
warning("xgboost: label will be ignored.")
}
if (is.character(data)) {
data <- path.expand(data)
dtrain <- xgb.DMatrix(data[1])
} else if (inherits(data, "xgb.DMatrix")) {
dtrain <- data
} else if (inherits(data, "data.frame")) {
stop("xgboost doesn't support data.frame as input. Convert it to matrix first.")
} else {
stop("xgboost: invalid input data")
}
}
return (dtrain)
}
#' Dimensions of xgb.DMatrix
#'
#' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}.
#' @param x Object of class \code{xgb.DMatrix}
#'
#' @details
#' Note: since \code{nrow} and \code{ncol} internally use \code{dim}, they can also
#' be directly used with an \code{xgb.DMatrix} object.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#'
#' stopifnot(nrow(dtrain) == nrow(train$data))
#' stopifnot(ncol(dtrain) == ncol(train$data))
#' stopifnot(all(dim(dtrain) == dim(train$data)))
#'
#' @export
dim.xgb.DMatrix <- function(x) {
c(.Call(XGDMatrixNumRow_R, x), .Call(XGDMatrixNumCol_R, x))
}
#' Handling of column names of \code{xgb.DMatrix}
#'
#' Only column names are supported for \code{xgb.DMatrix}, thus setting of
#' row names would have no effect and returned row names would be NULL.
#'
#' @param x object of class \code{xgb.DMatrix}
#' @param value a list of two elements: the first one is ignored
#' and the second one is column names
#'
#' @details
#' Generic \code{dimnames} methods are used by \code{colnames}.
#' Since row names are irrelevant, it is recommended to use \code{colnames} directly.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' train <- agaricus.train
#' dtrain <- xgb.DMatrix(train$data, label=train$label)
#' dimnames(dtrain)
#' colnames(dtrain)
#' colnames(dtrain) <- make.names(1:ncol(train$data))
#' print(dtrain, verbose=TRUE)
#'
#' @rdname dimnames.xgb.DMatrix
#' @export
dimnames.xgb.DMatrix <- function(x) {
attr(x, '.Dimnames')
}
#' @rdname dimnames.xgb.DMatrix
#' @export
`dimnames<-.xgb.DMatrix` <- function(x, value) {
if (!is.list(value) || length(value) != 2L)
stop("invalid 'dimnames' given: must be a list of two elements")
if (!is.null(value[[1L]]))
stop("xgb.DMatrix does not have rownames")
if (is.null(value[[2]])) {
attr(x, '.Dimnames') <- NULL
return(x)
}
if (ncol(x) != length(value[[2]]))
stop("can't assign ", length(value[[2]]), " colnames to a ",
ncol(x), " column xgb.DMatrix")
attr(x, '.Dimnames') <- value
x
}
#' Get information of an xgb.DMatrix object
#'
#' Get information of an xgb.DMatrix object
#' @param object Object of class \code{xgb.DMatrix}
#' @param name the name of the information field to get (see details)
#' @param ... other parameters
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label XGBoost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
#' \item \code{nrow}: number of rows of the \code{xgb.DMatrix}.
#'
#' }
#'
#' \code{group} can be setup by \code{setinfo} but can't be retrieved by \code{getinfo}.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#'
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all(labels2 == 1-labels))
#' @rdname getinfo
#' @export
getinfo <- function(object, ...) UseMethod("getinfo")
#' @rdname getinfo
#' @export
getinfo.xgb.DMatrix <- function(object, name, ...) {
if (typeof(name) != "character" ||
length(name) != 1 ||
!name %in% c('label', 'weight', 'base_margin', 'nrow',
'label_lower_bound', 'label_upper_bound')) {
stop("getinfo: name must be one of the following\n",
" 'label', 'weight', 'base_margin', 'nrow', 'label_lower_bound', 'label_upper_bound'")
}
if (name != "nrow"){
ret <- .Call(XGDMatrixGetInfo_R, object, name)
} else {
ret <- nrow(object)
}
if (length(ret) == 0) return(NULL)
return(ret)
}
#' Set information of an xgb.DMatrix object
#'
#' Set information of an xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param name the name of the field to get
#' @param info the specific field of information to set
#' @param ... other parameters
#'
#' @details
#' The \code{name} field can be one of the following:
#'
#' \itemize{
#' \item \code{label}: label XGBoost learn from ;
#' \item \code{weight}: to do a weight rescale ;
#' \item \code{base_margin}: base margin is the base prediction XGBoost will boost from ;
#' \item \code{group}: number of rows in each group (to use with \code{rank:pairwise} objective).
#' }
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
#'
#' labels <- getinfo(dtrain, 'label')
#' setinfo(dtrain, 'label', 1-labels)
#' labels2 <- getinfo(dtrain, 'label')
#' stopifnot(all.equal(labels2, 1-labels))
#' @rdname setinfo
#' @export
setinfo <- function(object, ...) UseMethod("setinfo")
#' @rdname setinfo
#' @export
setinfo.xgb.DMatrix <- function(object, name, info, ...) {
if (name == "label") {
if (length(info) != nrow(object))
stop("The length of labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE)
}
if (name == "label_lower_bound") {
if (length(info) != nrow(object))
stop("The length of lower-bound labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE)
}
if (name == "label_upper_bound") {
if (length(info) != nrow(object))
stop("The length of upper-bound labels must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE)
}
if (name == "weight") {
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE)
}
if (name == "base_margin") {
# if (length(info)!=nrow(object))
# stop("The length of base margin must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.numeric(info))
return(TRUE)
}
if (name == "group") {
if (sum(info) != nrow(object))
stop("The sum of groups must equal to the number of rows in the input data")
.Call(XGDMatrixSetInfo_R, object, name, as.integer(info))
return(TRUE)
}
stop("setinfo: unknown info name ", name)
return(FALSE)
}
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
#'
#' @param object Object of class "xgb.DMatrix"
#' @param idxset a integer vector of indices of rows needed
#' @param colset currently not used (columns subsetting is not available)
#' @param ... other parameters (currently not used)
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
#'
#' dsub <- slice(dtrain, 1:42)
#' labels1 <- getinfo(dsub, 'label')
#' dsub <- dtrain[1:42, ]
#' labels2 <- getinfo(dsub, 'label')
#' all.equal(labels1, labels2)
#'
#' @rdname slice.xgb.DMatrix
#' @export
slice <- function(object, ...) UseMethod("slice")
#' @rdname slice.xgb.DMatrix
#' @export
slice.xgb.DMatrix <- function(object, idxset, ...) {
if (!inherits(object, "xgb.DMatrix")) {
stop("object must be xgb.DMatrix")
}
ret <- .Call(XGDMatrixSliceDMatrix_R, object, idxset)
attr_list <- attributes(object)
nr <- nrow(object)
len <- sapply(attr_list, NROW)
ind <- which(len == nr)
if (length(ind) > 0) {
nms <- names(attr_list)[ind]
for (i in seq_along(ind)) {
obj_attr <- attr(object, nms[i])
if (NCOL(obj_attr) > 1) {
attr(ret, nms[i]) <- obj_attr[idxset, ]
} else {
attr(ret, nms[i]) <- obj_attr[idxset]
}
}
}
return(structure(ret, class = "xgb.DMatrix"))
}
#' @rdname slice.xgb.DMatrix
#' @export
`[.xgb.DMatrix` <- function(object, idxset, colset = NULL) {
slice(object, idxset)
}
#' Print xgb.DMatrix
#'
#' Print information about xgb.DMatrix.
#' Currently it displays dimensions and presence of info-fields and colnames.
#'
#' @param x an xgb.DMatrix object
#' @param verbose whether to print colnames (when present)
#' @param ... not currently used
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' dtrain <- with(agaricus.train, xgb.DMatrix(data, label = label))
#'
#' dtrain
#' print(dtrain, verbose=TRUE)
#'
#' @method print xgb.DMatrix
#' @export
print.xgb.DMatrix <- function(x, verbose = FALSE, ...) {
cat('xgb.DMatrix dim:', nrow(x), 'x', ncol(x), ' info: ')
infos <- character(0)
if (length(getinfo(x, 'label')) > 0) infos <- 'label'
if (length(getinfo(x, 'weight')) > 0) infos <- c(infos, 'weight')
if (length(getinfo(x, 'base_margin')) > 0) infos <- c(infos, 'base_margin')
if (length(infos) == 0) infos <- 'NA'
cat(infos)
cnames <- colnames(x)
cat(' colnames:')
if (verbose & !is.null(cnames)) {
cat("\n'")
cat(cnames, sep = "','")
cat("'")
} else {
if (is.null(cnames)) cat(' no')
else cat(' yes')
}
cat("\n")
invisible(x)
}