From b0be833c7585177ee835ede70ae58b9c7e347282 Mon Sep 17 00:00:00 2001 From: hetong007 Date: Thu, 27 Aug 2015 14:30:23 -0700 Subject: [PATCH 01/53] add save_period --- R-package/R/xgb.train.R | 10 +++++++++- R-package/R/xgboost.R | 7 +++++-- R-package/man/xgb.train.Rd | 7 ++++++- R-package/man/xgboost.Rd | 6 +++++- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index fb403143a843..b1d79d8660cd 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -72,6 +72,8 @@ #' keeps getting worse consecutively for \code{k} rounds. #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. +#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving. +#' @param save_name the name or path of the periodically saved model file. #' @param ... other parameters to pass to \code{params}. #' #' @details @@ -120,7 +122,8 @@ #' xgb.train <- function(params=list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, - early.stop.round = NULL, maximize = NULL, ...) { + early.stop.round = NULL, maximize = NULL, + save_period = 0, save_name = "xgboost.model", ...) { dtrain <- data if (typeof(params) != "list") { stop("xgb.train: first argument params must be list") @@ -215,6 +218,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } } } + if (save_period > 0) { + if (i %% save_period == 0) { + xgb.save(bst, save_name) + } + } } bst <- xgb.Booster.check(bst) if (!is.null(early.stop.round)) { diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 63077f866930..c96a7d89e831 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -36,6 +36,8 @@ #' keeps getting worse consecutively for \code{k} rounds. #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. #' \code{maximize=TRUE} means the larger the evaluation score the better. +#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving. +#' @param save_name the name or path of the periodically saved model file. #' @param ... other parameters to pass to \code{params}. #' #' @details @@ -58,7 +60,7 @@ #' xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, - maximize = NULL, ...) { + maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) { if (is.null(missing)) { dtrain <- xgb.get.DMatrix(data, label) } else { dtrain <- xgb.get.DMatrix(data, label, missing) } @@ -74,7 +76,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), } bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, - early.stop.round = early.stop.round) + early.stop.round = early.stop.round, maximize = maximize, + save_period = save_period, save_name = save_name) return(bst) } diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 7b1893ba7482..15a0b0ba7743 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -6,7 +6,8 @@ \usage{ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print.every.n = 1L, - early.stop.round = NULL, maximize = NULL, ...) + early.stop.round = NULL, maximize = NULL, save_period = 0, + save_name = "xgboost.model", ...)
} \arguments{ \item{params}{the list of parameters. @@ -87,6 +88,10 @@ keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. \code{maximize=TRUE} means the larger the evaluation score the better.} +\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.} + +\item{save_name}{the name or path of the periodically saved model file.} + \item{...}{other parameters to pass to \code{params}.} } \description{ diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 64bd003692e5..5dfeeaceea6b 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -6,7 +6,7 @@ \usage{ xgboost(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, - maximize = NULL, ...) + maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or @@ -51,6 +51,10 @@ keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. \code{maximize=TRUE} means the larger the evaluation score the better.} +\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.} + +\item{save_name}{the name or path of the periodically saved model file.} + \item{...}{other parameters to pass to \code{params}.} } \description{ From 635c39c4c3c7b0dac0bbf2539ddd0e7cf732fefb Mon Sep 17 00:00:00 2001 From: Tong He Date: Thu, 27 Aug 2015 15:35:53 -0700 Subject: [PATCH 02/53] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 121462d3851c..67dfbbd9e3ec 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,7 @@ Contents What's New ---------- +* XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/).
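For reference, the periodic-saving arguments introduced in [PATCH 01/53] slot into an ordinary training call. A minimal sketch on the agaricus data that ships with the R package; the file name periodic.model is an arbitrary choice for this illustration, and because save_name is fixed, each save overwrites the previous file:

data(agaricus.train, package = 'xgboost')
# with save_period = 5 and nround = 10, xgb.save() runs after rounds 5 and 10
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               max.depth = 2, eta = 1, nround = 10,
               objective = "binary:logistic",
               save_period = 5, save_name = "periodic.model")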
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance) Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower) * XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04) From 4554da0537539b42feca0dd6000b774902c11d7c Mon Sep 17 00:00:00 2001 From: hetong007 Date: Thu, 27 Aug 2015 15:56:35 -0700 Subject: [PATCH 03/53] add test module in R --- R-package/DESCRIPTION | 3 ++- R-package/tests/testthat.R | 4 ++++ R-package/tests/testthat/test_basic.R | 34 +++++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 1 deletion(-) create mode 100644 R-package/tests/testthat.R create mode 100644 R-package/tests/testthat/test_basic.R diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 19410d65a44a..59728f3c2f79 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -23,7 +23,8 @@ Suggests: ggplot2 (>= 1.0.0), DiagrammeR (>= 0.6), Ckmeans.1d.dp (>= 3.3.1), - vcd (>= 1.3) + vcd (>= 1.3), + testthat Depends: R (>= 2.10) Imports: diff --git a/R-package/tests/testthat.R b/R-package/tests/testthat.R new file mode 100644 index 000000000000..53cc6caba6db --- /dev/null +++ b/R-package/tests/testthat.R @@ -0,0 +1,4 @@ +library(testthat) +library(xgboost) + +test_check("xgboost") diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R new file mode 100644 index 000000000000..1c4a85921f43 --- /dev/null +++ b/R-package/tests/testthat/test_basic.R @@ -0,0 +1,34 @@ +require(xgboost) + +context("basic functions") + +test_that("data loading", { + data(agaricus.train, package='xgboost') + data(agaricus.test, package='xgboost') +}) + +test_that("train and prediction",{ + train = agaricus.train + test = agaricus.test + bst = xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") + pred = predict(bst, test$data) +}) + +test_that("early stopping", { + res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, + eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", + early.stop.round = 3, maximize = FALSE) + expect_true(nrow(res)<20) + bst = xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", + early.stop.round = 3, maximize = FALSE) + pred = predict(bst, test$data) +}) + +test_that("save_period", { + bst = xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", + save_period = 10, save_name = "xgb.model") + pred = predict(bst, test$data) +}) From 5773d4d3c40822d22b007032d223305c19cc577b Mon Sep 17 00:00:00 2001 From: hetong007 Date: Thu, 27 Aug 2015 16:02:41 -0700 Subject: [PATCH 04/53] fix test --- R-package/tests/testthat/test_basic.R | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 1c4a85921f43..791f1246c30c 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -2,19 +2,18 @@ require(xgboost) context("basic functions") -test_that("data loading", { - data(agaricus.train, package='xgboost') - data(agaricus.test, package='xgboost') -}) +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +train = agaricus.train +test = agaricus.test -test_that("train and prediction",{ - train = agaricus.train - test = agaricus.test 
+test_that("train and predict", { bst = xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") pred = predict(bst, test$data) }) + test_that("early stopping", { res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", From 632fdc3e190411073cc97fda5fa2526232ffe804 Mon Sep 17 00:00:00 2001 From: okaoka Date: Sat, 29 Aug 2015 19:45:11 +0900 Subject: [PATCH 05/53] Fix a typo --- doc/parameter.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/parameter.md b/doc/parameter.md index 4e0f365bf3db..228b11780581 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -105,7 +105,7 @@ The following parameters are only used in the console version of xgboost * task [default=train] options: train, pred, eval, dump - train: training using data - pred: making prediction for test:data - - eval: for evaluating statistics specified by eval[name]=filenam + - eval: for evaluating statistics specified by eval[name]=filename - dump: for dump the learned model into text format(preliminary) * model_in [default=NULL] - path to input model, needed for test, eval, dump, if it is specified in training, xgboost will continue training from the input model From 3d6c831e8acafe2f05f4c974193d462ed8615192 Mon Sep 17 00:00:00 2001 From: unknown Date: Wed, 2 Sep 2015 21:43:23 -0700 Subject: [PATCH 06/53] add error for data.frame, add weight to xgboost --- R-package/R/utils.R | 9 ++++++++- R-package/R/xgboost.R | 10 ++++------ R-package/man/xgboost.Rd | 9 ++++++--- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index e58601df8b61..732ef0d11b5a 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -103,17 +103,21 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) ## ----the following are low level iteratively function, not needed if ## you do not want to use them --------------------------------------- # get dmatrix from data, label -xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) { +xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) { inClass <- class(data) if (inClass == "dgCMatrix" || inClass == "matrix") { if (is.null(label)) { stop("xgboost: need label when data is a matrix") } + dtrain <- xgb.DMatrix(data, label = label) if (is.null(missing)){ dtrain <- xgb.DMatrix(data, label = label) } else { dtrain <- xgb.DMatrix(data, label = label, missing = missing) } + if (!is.null(weight)){ + xgb.setinfo(dtrain, "weight", weight) + } } else { if (!is.null(label)) { warning("xgboost: label will be ignored.") @@ -122,6 +126,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) { dtrain <- xgb.DMatrix(data) } else if (inClass == "xgb.DMatrix") { dtrain <- data + } else if (inClass == "data.frame") { + stop("xgboost only support numerical matrix input, + use 'data.frame' to transform the data.") } else { stop("xgboost: Invalid input of data") } diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index c96a7d89e831..164dc1838539 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -31,6 +31,7 @@ #' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed. #' @param missing Missing is only used when input is dense matrix, pick a float #' value that represents missing value. 
Sometimes a dataset uses 0 or another extreme value to represent missing values. +#' @param weight a vector indicating the weight for each row of the input. #' @param early.stop.round If \code{NULL}, the early stopping function is not triggered. #' If set to an integer \code{k}, training with a validation set will stop if the performance #' keeps getting worse consecutively for \code{k} rounds. @@ -58,14 +59,11 @@ #' #' @export #' -xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds, +xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL, + params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) { - if (is.null(missing)) { - dtrain <- xgb.get.DMatrix(data, label) - } else { - dtrain <- xgb.get.DMatrix(data, label, missing) - } + dtrain <- xgb.get.DMatrix(data, label, missing, weight) params <- append(params, list(...)) diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index 5dfeeaceea6b..a05560a19506 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -4,9 +4,10 @@ \alias{xgboost} \title{eXtreme Gradient Boosting (Tree) library} \usage{ -xgboost(data = NULL, label = NULL, missing = NULL, params = list(), - nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, - maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) +xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL, + params = list(), nrounds, verbose = 1, print.every.n = 1L, + early.stop.round = NULL, maximize = NULL, save_period = 0, + save_name = "xgboost.model", ...) } \arguments{ \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} \item{label}{the response variable. User should not set this field if data is a local data file or \code{xgb.DMatrix}.} \item{missing}{Missing is only used when the input is a dense matrix; pick a float value that represents missing values. Sometimes a dataset uses 0 or another extreme value to represent missing values.} +\item{weight}{a vector indicating the weight for each row of the input.} + \item{params}{the list of parameters. Commonly used ones are: From dbc5c9b82dd12581fc9842088e8605e560df583f Mon Sep 17 00:00:00 2001 From: Far0n Date: Sat, 5 Sep 2015 12:36:42 +0200 Subject: [PATCH 07/53] alpha & lambda for gbtree alpha & lambda descriptions to "Parameters for Tree Booster" added (issue #466) --- doc/parameter.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/doc/parameter.md b/doc/parameter.md index 228b11780581..8d05b9b6c2b7 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -46,6 +46,10 @@ Parameters for Tree Booster * colsample_bytree [default=1] - subsample ratio of columns when constructing each tree.
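The weight vector added by [PATCH 06/53] is forwarded through xgb.get.DMatrix to xgb.setinfo. A short sketch, again on the packaged agaricus data; the two-fold upweighting of positive rows is an arbitrary assumption made only for illustration:

data(agaricus.train, package = 'xgboost')
# hypothetical per-row weights: count each positive-label row twice as heavily
w <- ifelse(agaricus.train$label == 1, 2, 1)
bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
               weight = w, max.depth = 2, eta = 1, nthread = 2, nround = 2,
               objective = "binary:logistic")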
- range: (0,1] +* lambda [default=0] + - L2 regularization term on weights +* alpha [default=0] + - L1 regularization term on weights Parameters for Linear Booster ----------------------------- From a9f884bd47b49751337b5473e6e728909c19e852 Mon Sep 17 00:00:00 2001 From: Far0n Date: Sat, 5 Sep 2015 21:50:53 +0200 Subject: [PATCH 08/53] alpha = 1 as default value for gbtree --- doc/parameter.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/parameter.md b/doc/parameter.md index 8d05b9b6c2b7..86eda22029e6 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -48,7 +48,7 @@ Parameters for Tree Booster - range: (0,1] * lambda [default=0] - L2 regularization term on weights -* alpha [default=0] +* alpha [default=1] - L1 regularization term on weights Parameters for Linear Booster From cfcb1fc491d166474777c3581d52af9d6124057d Mon Sep 17 00:00:00 2001 From: Far0n Date: Sat, 5 Sep 2015 21:53:37 +0200 Subject: [PATCH 09/53] default values for gbtree: lambda=1, alpha=0 --- doc/parameter.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/parameter.md b/doc/parameter.md index 86eda22029e6..ba0a18870df9 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -46,9 +46,9 @@ Parameters for Tree Booster * colsample_bytree [default=1] - subsample ratio of columns when constructing each tree. - range: (0,1] -* lambda [default=0] +* lambda [default=1] - L2 regularization term on weights -* alpha [default=1] +* alpha [default=0] - L1 regularization term on weights Parameters for Linear Booster From 92b996513ef25a764d85e4c837edd73908e8b942 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Sat, 5 Sep 2015 22:50:27 -0400 Subject: [PATCH 10/53] TST: Added R unit test for glm --- R-package/tests/testthat/test_glm.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 R-package/tests/testthat/test_glm.R diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R new file mode 100644 index 000000000000..485aad82ddd4 --- /dev/null +++ b/R-package/tests/testthat/test_glm.R @@ -0,0 +1,19 @@ +context('Test generalized linear models') + +require(xgboost) + +test_that("glm works", { + data(agaricus.train, package='xgboost') + data(agaricus.test, package='xgboost') + dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) + dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) + expect_equal(class(dtrain), "xgb.DMatrix") + expect_equal(class(dtest), "xgb.DMatrix") + param <- list(objective = "binary:logistic", booster = "gblinear", + nthread = 2, alpha = 0.0001, lambda = 1) + watchlist <- list(eval = dtest, train = dtrain) + num_round <- 2 + expect_that(bst <- xgb.train(param, dtrain, num_round, watchlist), not(throws_error())) + expect_that(ypred <- predict(bst, dtest), not(throws_error())) + expect_equal(length(getinfo(dtest, 'label')), 1611) +}) From 339a53d9d4f8ac2a4d71b9034231f57477d24245 Mon Sep 17 00:00:00 2001 From: "Yuan Tang (Terry)" Date: Sun, 6 Sep 2015 20:00:25 -0400 Subject: [PATCH 11/53] fixed unit test in R --- R-package/tests/testthat/test_glm.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/tests/testthat/test_glm.R b/R-package/tests/testthat/test_glm.R index 485aad82ddd4..dc7b6efabfc1 100644 --- a/R-package/tests/testthat/test_glm.R +++ b/R-package/tests/testthat/test_glm.R @@ -13,7 +13,7 @@ test_that("glm works", { nthread = 2, alpha = 0.0001, lambda = 1) watchlist <- list(eval = dtest, train = dtrain) num_round <- 2 - expect_that(bst <- 
xgb.train(param, dtrain, num_round, watchlist), not(throws_error())) - expect_that(ypred <- predict(bst, dtest), not(throws_error())) + bst <- xgb.train(param, dtrain, num_round, watchlist) + ypred <- predict(bst, dtest) expect_equal(length(getinfo(dtest, 'label')), 1611) }) From 35944a13b49836853be7f40d7e508dad7e31ac09 Mon Sep 17 00:00:00 2001 From: Den Raskovalov Date: Sun, 6 Sep 2015 20:41:55 -0700 Subject: [PATCH 12/53] make XGBClassifier.score compatible with arrays --- python-package/xgboost/sklearn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 6f176972aced..a2761c5abcf7 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -319,7 +319,7 @@ def predict(self, data): if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: - column_indexes = np.repeat(0, data.shape[0]) + column_indexes = np.repeat(0, class_probs.shape[0]) column_indexes[class_probs > 0.5] = 1 return self._le.inverse_transform(column_indexes) From 78afd6c7729929f1bea0ed7e902f80fe939d8c1c Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 21:36:52 -0400 Subject: [PATCH 13/53] TST: Added test for dump --- R-package/tests/testthat/test_helpers.R | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 R-package/tests/testthat/test_helpers.R diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R new file mode 100644 index 000000000000..a7656f07213e --- /dev/null +++ b/R-package/tests/testthat/test_helpers.R @@ -0,0 +1,17 @@ +context('Test helper functions') + +require(xgboost) + +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +train <- agaricus.train +test <- agaricus.test + +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, + eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") + +test_that("dump works", { + capture.output(print(xgb.dump(bst))) +}) + + From d833038ba1e455b19e5f57be36acc8d624f5f6a9 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 21:48:57 -0400 Subject: [PATCH 14/53] TST: Added test for xgb.importance --- R-package/tests/testthat/test_helpers.R | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index a7656f07213e..57f4c48fb67e 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -1,17 +1,27 @@ context('Test helper functions') require(xgboost) +require(data.table) +require(Matrix) +require(vcd) +data(Arthritis) data(agaricus.train, package='xgboost') -data(agaricus.test, package='xgboost') -train <- agaricus.train -test <- agaricus.test +df <- data.table(Arthritis, keep.rownames = F) +df[,AgeDiscret:= as.factor(round(Age/10,0))] +df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] +df[,ID:=NULL] +sparse_matrix = sparse.model.matrix(Improved~.-1, data = df) +output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] +bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, + eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, - eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -test_that("dump works", { +test_that("xgb.dump works", { capture.output(print(xgb.dump(bst))) }) - +test_that("xgb.importance works", 
{ + xgb.dump(bst, 'xgb.model.dump', with.stats = T) + importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') +}) From 408c3a62a8255c84f777219c54b2f985e6c37bf7 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 21:49:27 -0400 Subject: [PATCH 15/53] TST: Added test for xgb.plot.tree --- R-package/tests/testthat/test_helpers.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 57f4c48fb67e..e26ec155b241 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -25,3 +25,7 @@ test_that("xgb.importance works", { xgb.dump(bst, 'xgb.model.dump', with.stats = T) importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') }) + +test_that("xgb.plot.tree works", { + xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) +}) \ No newline at end of file From 886955148ddcc3cc711ff1307d4625f3406e3931 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 21:55:17 -0400 Subject: [PATCH 16/53] TST: Added test for models with custom objective --- .../tests/testthat/test_custom_objective.R | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) create mode 100644 R-package/tests/testthat/test_custom_objective.R diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R new file mode 100644 index 000000000000..a941e39c8b4a --- /dev/null +++ b/R-package/tests/testthat/test_custom_objective.R @@ -0,0 +1,43 @@ +context('Test models with custom objective') + +require(xgboost) + +test_that("custom objective works", { + data(agaricus.train, package='xgboost') + data(agaricus.test, package='xgboost') + dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) + dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) + + watchlist <- list(eval = dtest, train = dtrain) + num_round <- 2 + + logregobj <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + preds <- 1/(1 + exp(-preds)) + grad <- preds - labels + hess <- preds * (1 - preds) + return(list(grad = grad, hess = hess)) + } + evalerror <- function(preds, dtrain) { + labels <- getinfo(dtrain, "label") + err <- as.numeric(sum(labels != (preds > 0)))/length(labels) + return(list(metric = "error", value = err)) + } + + param <- list(max.depth=2, eta=1, nthread = 2, silent=1, + objective=logregobj, eval_metric=evalerror) + + bst <- xgb.train(param, dtrain, num_round, watchlist) + attr(dtrain, 'label') <- getinfo(dtrain, 'label') + + logregobjattr <- function(preds, dtrain) { + labels <- attr(dtrain, 'label') + preds <- 1/(1 + exp(-preds)) + grad <- preds - labels + hess <- preds * (1 - preds) + return(list(grad = grad, hess = hess)) + } + param <- list(max.depth=2, eta=1, nthread = 2, silent=1, + objective=logregobjattr, eval_metric=evalerror) + bst <- xgb.train(param, dtrain, num_round, watchlist) +}) \ No newline at end of file From 3a49e1bdb1c83c720a1cc1a471bcc9da7085b43c Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 21:56:50 -0400 Subject: [PATCH 17/53] TST: Added more checks for testing custom objective --- R-package/tests/testthat/test_custom_objective.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index a941e39c8b4a..9fcbeca4d230 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ 
b/R-package/tests/testthat/test_custom_objective.R @@ -28,6 +28,8 @@ test_that("custom objective works", { objective=logregobj, eval_metric=evalerror) bst <- xgb.train(param, dtrain, num_round, watchlist) + expect_equal(class(bst), "xgb.Booster") + expect_equal(length(bst$raw), 1064) attr(dtrain, 'label') <- getinfo(dtrain, 'label') logregobjattr <- function(preds, dtrain) { @@ -40,4 +42,6 @@ test_that("custom objective works", { param <- list(max.depth=2, eta=1, nthread = 2, silent=1, objective=logregobjattr, eval_metric=evalerror) bst <- xgb.train(param, dtrain, num_round, watchlist) + expect_equal(class(bst), "xgb.Booster") + expect_equal(length(bst$raw), 1064) }) \ No newline at end of file From c50cf6d7ff367ca40a4453f10d07edf569614f9f Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 22:03:28 -0400 Subject: [PATCH 18/53] TST: Added test for poisson regression --- R-package/tests/testthat/test_poisson_regression.R | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 R-package/tests/testthat/test_poisson_regression.R diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R new file mode 100644 index 000000000000..5d3d78e27ca0 --- /dev/null +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -0,0 +1,13 @@ +context('Test poisson regression model') + +require(xgboost) + +test_that("poisson regression works", { + data(mtcars) + bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], + objective='count:poisson',nrounds=5) + expect_equal(class(bst), "xgb.Booster") + pred = predict(bst,as.matrix(mtcars[,-11])) + expect_equal(length(pred), 32) + sqrt(mean((pred-mtcars[,11])^2)) +}) \ No newline at end of file From cb3afeec53dc8d0241ca96057a3882a8e6e6c438 Mon Sep 17 00:00:00 2001 From: "Yuan Tang (Terry)" Date: Mon, 7 Sep 2015 22:23:47 -0400 Subject: [PATCH 19/53] DOC: Typo in README.md in tests folder --- tests/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/README.md b/tests/README.md index 19e34d5dfac2..ee7c8fcc6857 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1 +1 @@ -This folder contains tetstcases for xgboost. \ No newline at end of file +This folder contains testcases for xgboost. From fbf2a5feedb43e5d81ca6122b060b51565d7f1c9 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 22:49:10 -0400 Subject: [PATCH 20/53] DOC: Updated CONTRIBUTORS.md --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 6ae79f795aee..ab9c980c89a6 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -34,6 +34,7 @@ List of Contributors * [Zygmunt Zając](https://github.com/zygmuntz) - Zygmunt is the master behind the early stopping feature frequently used by kagglers. 
* [Ajinkya Kale](https://github.com/ajkl) +* [Yuan Tang](https://github.com/terrytangyuan) * [Boliang Chen](https://github.com/cblsjtu) * [Vadim Khotilovich](https://github.com/khotilov) * [Yangqing Men](https://github.com/yanqingmen) From 33f1ab3ae1237f788f09e81a0106aff46869b2bd Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Mon, 7 Sep 2015 22:51:14 -0400 Subject: [PATCH 21/53] TST: Added one minor check for xgb.importance --- R-package/tests/testthat/test_helpers.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index e26ec155b241..4d80146e30a1 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -24,6 +24,7 @@ test_that("xgb.dump works", { test_that("xgb.importance works", { xgb.dump(bst, 'xgb.model.dump', with.stats = T) importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') + expect_equal(dim(importance), c(7, 4)) }) test_that("xgb.plot.tree works", { From eb1b185d700d130f8ba4aaba7ca7a6070687822d Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 8 Sep 2015 09:47:48 -0400 Subject: [PATCH 22/53] TST: Added glm test for Python --- tests/python/test_models.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 tests/python/test_models.py diff --git a/tests/python/test_models.py b/tests/python/test_models.py new file mode 100644 index 000000000000..a12198f5953a --- /dev/null +++ b/tests/python/test_models.py @@ -0,0 +1,15 @@ +import numpy as np +import xgboost as xgb + +dpath = 'demo/data/' + +def test_glm(): + dtrain = xgb.DMatrix('../data/agaricus.txt.train') + dtest = xgb.DMatrix('../data/agaricus.txt.test') + param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', + 'alpha': 0.0001, 'lambda': 1 } + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 4 + bst = xgb.train(param, dtrain, num_round, watchlist) + preds = bst.predict(dtest) + labels = dtest.get_label() \ No newline at end of file From 82a43f448eb5466e3a47b52d619dc01e0ac87f5f Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 8 Sep 2015 09:54:38 -0400 Subject: [PATCH 23/53] TST: Added Python test for custom objective functions --- tests/python/test_models.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/python/test_models.py b/tests/python/test_models.py index a12198f5953a..b0eb7482d2ce 100644 --- a/tests/python/test_models.py +++ b/tests/python/test_models.py @@ -2,14 +2,31 @@ import xgboost as xgb dpath = 'demo/data/' +dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') +dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') def test_glm(): - dtrain = xgb.DMatrix('../data/agaricus.txt.train') - dtest = xgb.DMatrix('../data/agaricus.txt.test') param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 } watchlist = [(dtest,'eval'), (dtrain,'train')] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) preds = bst.predict(dtest) - labels = dtest.get_label() \ No newline at end of file + labels = dtest.get_label() + +def test_custom_objective(): + param = {'max_depth':2, 'eta':1, 'silent':1 } + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 2 + def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0-preds) + return grad, hess + def evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', 
float(sum(labels != (preds > 0.0))) / len(labels) bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) From 8196d5d680f0beeaf390d5fbbaa8d8f32207669b Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 8 Sep 2015 10:14:28 -0400 Subject: [PATCH 24/53] TST: More thorough checks for Python tests --- tests/python/test_models.py | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) diff --git a/tests/python/test_models.py b/tests/python/test_models.py index b0eb7482d2ce..8c06d9de9528 100644 --- a/tests/python/test_models.py +++ b/tests/python/test_models.py @@ -6,27 +6,34 @@ dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') def test_glm(): - param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', - 'alpha': 0.0001, 'lambda': 1 } - watchlist = [(dtest,'eval'), (dtrain,'train')] + param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 } + watchlist = [(dtest,'eval'), (dtrain,'train')] num_round = 4 bst = xgb.train(param, dtrain, num_round, watchlist) + assert isinstance(bst, xgb.core.Booster) preds = bst.predict(dtest) labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.1 def test_custom_objective(): param = {'max_depth':2, 'eta':1, 'silent':1 } watchlist = [(dtest,'eval'), (dtrain,'train')] num_round = 2 def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0-preds) - return grad, hess - def evalerror(preds, dtrain): - labels = dtrain.get_label() - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) - bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0-preds) + return grad, hess + def evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) + assert isinstance(bst, xgb.core.Booster) + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.1 From d3bb466026a9fdf0f44e445272f3cff8f6b730c1 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 8 Sep 2015 10:51:20 -0400 Subject: [PATCH 25/53] ENH/DOC: Added R package demo using caret library to train xgbTree model --- R-package/demo/caret_wrapper.R | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 R-package/demo/caret_wrapper.R diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R new file mode 100644 index 000000000000..5c53c99156b2 --- /dev/null +++ b/R-package/demo/caret_wrapper.R @@ -0,0 +1,32 @@ +# install the development version of the caret library that contains xgboost models +devtools::install_github("topepo/caret/pkg/caret") +require(caret) +require(xgboost) +require(data.table) +require(vcd) +require(e1071) + +# Load Arthritis dataset in memory. +data(Arthritis) +# Create a copy of the dataset with data.table package (data.table is 100% compliant with R dataframe but its syntax is a lot more consistent and its performance is really good). +df <- data.table(Arthritis, keep.rownames = F) + +# Let's add some new categorical features to see if it helps.
Of course, these features are highly correlated with the Age feature. Usually it's not a good thing in ML, but Tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features. +# For the first feature we create groups of age by rounding the real age. Note that we transform it to factor (categorical data) so the algorithm treats them as independent values. +df[,AgeDiscret:= as.factor(round(Age/10,0))] + +# Here is an even stronger simplification of the real age with an arbitrary split at 30 years old. I chose this value based on nothing. We will see later if simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!). +df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] + +# We remove ID as there is nothing to learn from this feature (it will just add some noise as the dataset is small). +df[,ID:=NULL] + +#-------------Basic Training using XGBoost in caret Library----------------- +# set up control parameters for caret::train +# here we use 10-fold cross-validation, repeating twice +fitControl <- trainControl(method = "cv", number = 10, repeats = 2) +# train an xgbTree model using caret::train +model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl) + +# See model results +print(model) \ No newline at end of file From 9ead44531e1784974fb7a719dc07110e9d8cd591 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Tue, 8 Sep 2015 10:54:07 -0400 Subject: [PATCH 26/53] DOC: Added new demo to index --- R-package/demo/00Index | 1 + R-package/demo/runall.R | 1 + 2 files changed, 2 insertions(+) diff --git a/R-package/demo/00Index b/R-package/demo/00Index index 0112eb9e19c7..f3d241470799 100644 --- a/R-package/demo/00Index +++ b/R-package/demo/00Index @@ -1,4 +1,5 @@ basic_walkthrough Basic feature walkthrough +caret_wrapper Use xgboost to train models with the caret library custom_objective Customize loss function and evaluation metric boost_from_prediction Boosting from existing prediction predict_first_ntree Predicting using first n trees diff --git a/R-package/demo/runall.R b/R-package/demo/runall.R index 7311ec95ed94..c337f81643d0 100644 --- a/R-package/demo/runall.R +++ b/R-package/demo/runall.R @@ -9,3 +9,4 @@ demo(create_sparse_matrix) demo(predict_leaf_indices) demo(early_stopping) demo(poisson_regression) +demo(caret_wrapper) From 0c0e26effa66c0053e863f02e5c9b87ae1b0c106 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Tue, 8 Sep 2015 19:45:39 -0700 Subject: [PATCH 27/53] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 67dfbbd9e3ec..4c14e722bd1f 100644 --- a/README.md +++ b/README.md @@ -19,7 +19,7 @@ Contents * [Build Instruction](doc/build.md) * [Features](#features) * [Distributed XGBoost](multi-node) -* [Usecases](doc/README.md#highlight-links) +* [Usecases](doc/index.md#highlight-links) * [Bug Reporting](#bug-reporting) * [Contributing to XGBoost](#contributing-to-xgboost) * [Committers and Contributors](CONTRIBUTORS.md) From 62e95dcc600991f0367fbd9ebd9e0781f23f68c2 Mon Sep 17 00:00:00 2001 From: "Yuan Tang (Terry)" Date: Thu, 10 Sep 2015 23:23:30 -0400 Subject: [PATCH 28/53] DOC: Added caret_wrapper.R link into demo/README.md --- R-package/demo/README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R-package/demo/README.md b/R-package/demo/README.md index be08ee54fbe1..5ee1aa797b4f 100644 --- a/R-package/demo/README.md +++
b/R-package/demo/README.md @@ -1,6 +1,7 @@ XGBoost R Feature Walkthrough ==== -* [Basic walkthrough of wrappers](basic_walkthrough.R) +* [Basic walkthrough of wrappers](basic_walkthrough.R) +* [Train an xgboost model with the caret library](caret_wrapper.R) * [Customize loss function and evaluation metric](custom_objective.R) * [Boosting from existing prediction](boost_from_prediction.R) * [Predicting using first n trees](predict_first_ntree.R) From 424bcc05fa25bf186acbaa99c9262325aa1b6d24 Mon Sep 17 00:00:00 2001 From: terrytangyuan Date: Thu, 10 Sep 2015 23:41:36 -0400 Subject: [PATCH 29/53] ENH: More comments and explanation on demo using xgboost from caret --- R-package/demo/caret_wrapper.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/R-package/demo/caret_wrapper.R b/R-package/demo/caret_wrapper.R index 5c53c99156b2..13e05e5622c0 100644 --- a/R-package/demo/caret_wrapper.R +++ b/R-package/demo/caret_wrapper.R @@ -22,11 +22,14 @@ df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] df[,ID:=NULL] #-------------Basic Training using XGBoost in caret Library----------------- -# set up control parameters for caret::train -# here we use 10-fold cross-validation, repeating twice -fitControl <- trainControl(method = "cv", number = 10, repeats = 2) +# Set up control parameters for caret::train +# Here we use 10-fold cross-validation, repeating twice, and using random search for tuning hyper-parameters. +fitControl <- trainControl(method = "cv", number = 10, repeats = 2, search = "random") # train an xgbTree model using caret::train model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl) +# Instead of trees for our boosters, you can also fit a linear regression or logistic regression model using xgbLinear +# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl) + # See model results -print(model) \ No newline at end of file +print(model) From 6506a1c4905807b7dc4de0db3fbd0b28806d8c13 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 12 Sep 2015 11:32:51 +0900 Subject: [PATCH 30/53] ENH: allow python to handle feature names --- python-package/xgboost/core.py | 76 ++++++++++++++++++++++++++++++++-- wrapper/xgboost_wrapper.cpp | 8 ++++ wrapper/xgboost_wrapper.h | 7 ++++ 3 files changed, 88 insertions(+), 3 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 41943cd61218..2a4782eeae4c 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -4,6 +4,7 @@ from __future__ import absolute_import import os +import re import sys import ctypes import platform @@ -131,7 +132,11 @@ class DMatrix(object): which is optimized for both memory efficiency and training speed. You can construct DMatrix from numpy.arrays """ - def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): + + feature_names = None # for previous version's pickle + + def __init__(self, data, label=None, missing=0.0, + weight=None, silent=False, feature_names=None): """ Data matrix used in XGBoost. @@ -149,6 +154,8 @@ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): Weight for each instance. silent : boolean, optional Whether print messages during construction + feature_names : list, optional + Labels for features.
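Beyond the random search used in the caret demo above, caret::train also accepts an explicit tuning grid through its tuneGrid argument. A sketch under the assumption that the installed xgbTree method tunes nrounds, max_depth, eta, gamma, colsample_bytree and min_child_weight, with df built as in caret_wrapper.R:

require(caret)
# candidate values; the grid columns must match the method's tuning parameters
grid <- expand.grid(nrounds = c(10, 20),
                    max_depth = c(2, 6),
                    eta = c(0.1, 0.3),
                    gamma = 0,
                    colsample_bytree = 0.8,
                    min_child_weight = 1)
fitControl <- trainControl(method = "cv", number = 10)
model <- train(factor(Improved)~., data = df, method = "xgbTree",
               trControl = fitControl, tuneGrid = grid)
print(model)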
""" # force into void_p, mac need to pass things in as void_p if data is None: @@ -176,6 +183,18 @@ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False): if weight is not None: self.set_weight(weight) + # validate feature name + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + raise ValueError('feature_names must have the same length as data') + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') + self.feature_names = feature_names + def _init_from_csr(self, csr): """ Initialize data from a CSR matrix. @@ -391,6 +410,18 @@ def num_row(self): ctypes.byref(ret))) return ret.value + def num_col(self): + """Get the number of columns in the DMatrix. + + Returns + ------- + number of columns : int + """ + ret = ctypes.c_ulong() + _check_call(_LIB.XGDMatrixNumCol(self.handle, + ctypes.byref(ret))) + return ret.value + def slice(self, rindex): """Slice the DMatrix and return a new DMatrix that only contains `rindex`. @@ -404,7 +435,7 @@ def slice(self, rindex): res : DMatrix A new DMatrix containing only selected indices. """ - res = DMatrix(None) + res = DMatrix(None, feature_names=self.feature_names) res.handle = ctypes.c_void_p() _check_call(_LIB.XGDMatrixSliceDMatrix(self.handle, c_array(ctypes.c_int, rindex), @@ -419,6 +450,9 @@ class Booster(object): Booster is the model of xgboost, that contains low level routines for training, prediction and evaluation. """ + + feature_names = None + def __init__(self, params=None, cache=(), model_file=None): # pylint: disable=invalid-name """Initialize the Booster. @@ -435,6 +469,7 @@ def __init__(self, params=None, cache=(), model_file=None): for d in cache: if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) + self._validate_feature_names(d) dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle))) @@ -519,6 +554,8 @@ def update(self, dtrain, iteration, fobj=None): """ if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) + self._validate_feature_names(dtrain) + if fobj is None: _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)) else: @@ -543,6 +580,8 @@ def boost(self, dtrain, grad, hess): raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) + self._validate_feature_names(dtrain) + _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, c_array(ctypes.c_float, grad), c_array(ctypes.c_float, hess), @@ -572,6 +611,8 @@ def eval_set(self, evals, iteration=0, feval=None): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) + self._validate_feature_names(d) + dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) msg = ctypes.c_char_p() @@ -605,6 +646,7 @@ def eval(self, data, name='eval', iteration=0): result: str Evaluation result string. 
""" + self._validate_feature_names(data) return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): @@ -642,6 +684,9 @@ def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): option_mask |= 0x01 if pred_leaf: option_mask |= 0x02 + + self._validate_feature_names(data) + length = ctypes.c_ulong() preds = ctypes.POINTER(ctypes.c_float)() _check_call(_LIB.XGBoosterPredict(self.handle, data.handle, @@ -731,6 +776,7 @@ def get_dump(self, fmap='', with_stats=False): """ Returns the dump the model as a list of strings. """ + res = [] length = ctypes.c_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() _check_call(_LIB.XGBoosterDumpModel(self.handle, @@ -738,9 +784,19 @@ def get_dump(self, fmap='', with_stats=False): int(with_stats), ctypes.byref(length), ctypes.byref(sarr))) - res = [] for i in range(length.value): res.append(str(sarr[i].decode('ascii'))) + + if self.feature_names is not None: + defaults = ['f{0}'.format(i) for i in + range(len(self.feature_names))] + rep = dict((re.escape(k), v) for k, v in + zip(defaults, self.feature_names)) + pattern = re.compile("|".join(rep)) + def _replace(expr): + """ Replace matched group to corresponding values """ + return pattern.sub(lambda m: rep[re.escape(m.group(0))], expr) + res = [_replace(r) for r in res] return res def get_fscore(self, fmap=''): @@ -765,3 +821,17 @@ def get_fscore(self, fmap=''): else: fmap[fid] += 1 return fmap + + def _validate_feature_names(self, data): + """ + Validate Booster and data's feature_names are identical + """ + if self.feature_names is None: + self.feature_names = data.feature_names + else: + # Booster can't accept data with different feature names + if self.feature_names != data.feature_names: + msg = 'feature_names mismatch: {0} {1}' + raise ValueError(msg.format(self.feature_names, + data.feature_names)) + diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index 6956b567d27f..a6151a567d47 100644 --- a/wrapper/xgboost_wrapper.cpp +++ b/wrapper/xgboost_wrapper.cpp @@ -435,6 +435,7 @@ int XGDMatrixGetUIntInfo(const DMatrixHandle handle, *out_dptr = BeginPtr(vec); API_END(); } + int XGDMatrixNumRow(const DMatrixHandle handle, bst_ulong *out) { API_BEGIN(); @@ -442,6 +443,13 @@ int XGDMatrixNumRow(const DMatrixHandle handle, API_END(); } +int XGDMatrixNumCol(const DMatrixHandle handle, + bst_ulong *out) { + API_BEGIN(); + *out = static_cast(static_cast(handle)->info.num_col()); + API_END(); +} + // xgboost implementation int XGBoosterCreate(DMatrixHandle dmats[], bst_ulong len, diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h index 6d3a619fbc74..0d688b236427 100644 --- a/wrapper/xgboost_wrapper.h +++ b/wrapper/xgboost_wrapper.h @@ -184,6 +184,13 @@ XGB_DLL int XGDMatrixGetUIntInfo(const DMatrixHandle handle, */ XGB_DLL int XGDMatrixNumRow(DMatrixHandle handle, bst_ulong *out); +/*! + * \brief get number of columns + * \param handle the handle to the DMatrix + * \return 0 when success, -1 when failure happens + */ +XGB_DLL int XGDMatrixNumCol(DMatrixHandle handle, + bst_ulong *out); // --- start XGBoost class /*! 
* \brief create xgboost learner From 13c8d2ba7467d0aa996d0f331334d9e563bd0d17 Mon Sep 17 00:00:00 2001 From: phunterlau Date: Sun, 13 Sep 2015 17:34:37 -0700 Subject: [PATCH 31/53] add multi-thread static link for MAC --- Makefile | 10 +++++++++- build.sh | 12 ++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c790f6b726ee..dfafc61cf647 100644 --- a/Makefile +++ b/Makefile @@ -21,9 +21,17 @@ endif ifeq ($(no_omp),1) CFLAGS += -DDISABLE_OPENMP else - CFLAGS += -fopenmp + #CFLAGS += -fopenmp + ifeq ($(omp_mac_static),1) + #CFLAGS += -fopenmp -Bstatic + CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp + #LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive + else + CFLAGS += -fopenmp + endif endif + # by default use c++11 ifeq ($(cxx11),1) CFLAGS += -std=c++11 diff --git a/build.sh b/build.sh index 3a899d6d43b2..3ca795c0b1a2 100755 --- a/build.sh +++ b/build.sh @@ -6,6 +6,18 @@ # See additional instruction in doc/build.md +#for building static OpenMP lib in MAC for easier installation in MAC +#doesn't work with XCode clang/LLVM since Apple doesn't support, +#needs brew install gcc 4.9+ with OpenMP +static_omp=1 +if ((${static_omp}==1)); then + rm libgomp.a + ln -s `g++ -print-file-name=libgomp.a` + make clean + make omp_mac_static=1 + echo "Successfully build multi-thread static link xgboost" + exit 0 +fi if make; then echo "Successfully build multi-thread xgboost" From 529b80406cb9393d29c41d8b4ce160a39fcf7acc Mon Sep 17 00:00:00 2001 From: phunterlau Date: Sun, 13 Sep 2015 17:36:49 -0700 Subject: [PATCH 32/53] switch back to dynamic build by default --- build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/build.sh b/build.sh index 3ca795c0b1a2..2a285597ab69 100755 --- a/build.sh +++ b/build.sh @@ -8,8 +8,8 @@ #for building static OpenMP lib in MAC for easier installation in MAC #doesn't work with XCode clang/LLVM since Apple doesn't support, -#needs brew install gcc 4.9+ with OpenMP -static_omp=1 +#needs brew install gcc 4.9+ with OpenMP. 
By default the static link is OFF +static_omp=0 if ((${static_omp}==1)); then rm libgomp.a ln -s `g++ -print-file-name=libgomp.a` make clean make omp_mac_static=1 echo "Successfully build multi-thread static link xgboost" exit 0 fi if make; then echo "Successfully build multi-thread xgboost" From 0406c64a5d422d8c80b60944f478bdd54580db82 Mon Sep 17 00:00:00 2001 From: Far0n Date: Mon, 14 Sep 2015 11:25:41 +0200 Subject: [PATCH 33/53] bugfix evals_result regex --- python-package/xgboost/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index 1f2d722aca0f..2f890a063c65 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -118,7 +118,7 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":-([0-9.]+).", msg) + res = re.findall(":-?([0-9.]+).", msg) for key, val in zip(evals_name, res): evals_result[key].append(val) From 48ac946d9f2b462701e90a198c3d690751a76a6f Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sat, 12 Sep 2015 14:36:17 +0900 Subject: [PATCH 34/53] Use ctypes --- python-package/xgboost/core.py | 88 +++++++++++++++++++++------------- tests/python/test_basic.py | 20 ++++++++ wrapper/xgboost_wrapper.cpp | 23 +++++++-- wrapper/xgboost_wrapper.h | 20 ++++++++ 4 files changed, 115 insertions(+), 36 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2a4782eeae4c..bcb68580e325 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -1,10 +1,9 @@ # coding: utf-8 -# pylint: disable=too-many-arguments +# pylint: disable=too-many-arguments, too-many-branches """Core XGBoost Library.""" from __future__ import absolute_import import os -import re import sys import ctypes import platform @@ -24,8 +23,9 @@ class XGBoostError(Exception): if sys.version_info[0] == 3: - # pylint: disable=invalid-name + # pylint: disable=invalid-name, redefined-builtin STRING_TYPES = str, + unicode = str else: # pylint: disable=invalid-name STRING_TYPES = basestring, @@ -184,15 +184,18 @@ def __init__(self, data, label=None, missing=0.0, self.set_weight(weight) # validate feature name - if not isinstance(feature_names, list): - feature_names = list(feature_names) - if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') - if len(feature_names) != self.num_col(): - raise ValueError('feature_names must have the same length as data') - if not all(isinstance(f, STRING_TYPES) and f.isalnum() - for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') + if not feature_names is None: + if not isinstance(feature_names, list): + feature_names = list(feature_names) + if len(feature_names) != len(set(feature_names)): + raise ValueError('feature_names must be unique') + if len(feature_names) != self.num_col(): + msg = 'feature_names must have the same length as data' + raise ValueError(msg) + # prohibit the use of symbols that may affect parsing, e.g. ``[]=.`` + if not all(isinstance(f, STRING_TYPES) and f.isalnum() + for f in feature_names): + raise ValueError('all feature_names must be alphanumerics') self.feature_names = feature_names @@ -411,13 +414,13 @@ def num_row(self): return ret.value def num_col(self): - """Get the number of columns in the DMatrix. + """Get the number of columns (features) in the DMatrix.
Returns ------- number of columns : int """ - ret = ctypes.c_ulong() + ret = ctypes.c_uint() _check_call(_LIB.XGDMatrixNumCol(self.handle, ctypes.byref(ret))) return ret.value @@ -611,7 +614,7 @@ def eval_set(self, evals, iteration=0, feval=None): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) - self._validate_feature_names(d) + self._validate_feature_names(d[0]) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) @@ -776,27 +779,46 @@ def get_dump(self, fmap='', with_stats=False): """ Returns the dump the model as a list of strings. """ - res = [] + length = ctypes.c_ulong() sarr = ctypes.POINTER(ctypes.c_char_p)() - _check_call(_LIB.XGBoosterDumpModel(self.handle, - c_str(fmap), - int(with_stats), - ctypes.byref(length), - ctypes.byref(sarr))) + if self.feature_names is not None and fmap == '': + flen = int(len(self.feature_names)) + fname = (ctypes.c_char_p * flen)() + ftype = (ctypes.c_char_p * flen)() + + # supports quantitative type only + # {'q': quantitative, 'i': indicator} + if sys.version_info[0] == 3: + features = [bytes(f, 'utf-8') for f in self.feature_names] + types = [bytes('q', 'utf-8')] * flen + else: + features = [f.encode('utf-8') if isinstance(f, unicode) else f + for f in self.feature_names] + types = ['q'] * flen + + fname[:] = features + ftype[:] = types + _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle, + flen, + fname, + ftype, + int(with_stats), + ctypes.byref(length), + ctypes.byref(sarr))) + else: + _check_call(_LIB.XGBoosterDumpModel(self.handle, + c_str(fmap), + int(with_stats), + ctypes.byref(length), + ctypes.byref(sarr))) + + res = [] for i in range(length.value): - res.append(str(sarr[i].decode('ascii'))) - - if self.feature_names is not None: - defaults = ['f{0}'.format(i) for i in - range(len(self.feature_names))] - rep = dict((re.escape(k), v) for k, v in - zip(defaults, self.feature_names)) - pattern = re.compile("|".join(rep)) - def _replace(expr): - """ Replace matched group to corresponding values """ - return pattern.sub(lambda m: rep[re.escape(m.group(0))], expr) - res = [_replace(r) for r in res] + try: + res.append(str(sarr[i].decode('ascii'))) + except UnicodeDecodeError: + res.append(unicode(sarr[i].decode('utf-8'))) return res def get_fscore(self, fmap=''): diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 93ebaa7fdadf..70de2626c753 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -29,6 +29,26 @@ def test_basic(): # assert they are the same assert np.sum(np.abs(preds2-preds)) == 0 +def test_feature_names(): + data = np.random.randn(100, 5) + target = np.array([0, 1] * 50) + + features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] + dm = xgb.DMatrix(data, label=target, + feature_names=features) + assert dm.feature_names == features + assert dm.num_row() == 100 + assert dm.num_col() == 5 + + params={'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + scores = bst.get_fscore() + assert list(sorted(k for k in scores)) == features + def test_plotting(): bst2 = xgb.Booster(model_file='xgb.model') # plotting diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp index a6151a567d47..6d547fe183c7 100644 --- a/wrapper/xgboost_wrapper.cpp +++ 
b/wrapper/xgboost_wrapper.cpp
@@ -445,9 +445,9 @@ int XGDMatrixNumRow(const DMatrixHandle handle,
 int XGDMatrixNumCol(const DMatrixHandle handle,
                     bst_ulong *out) {
-    API_BEGIN();
-    *out = static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_col());
-    API_END();
+  API_BEGIN();
+  *out = static_cast<bst_ulong>(static_cast<const DataMatrix*>(handle)->info.num_col());
+  API_END();
 }
 
 // xgboost implementation
@@ -580,3 +580,20 @@ int XGBoosterDumpModel(BoosterHandle handle,
                          featmap, with_stats != 0, len);
   API_END();
 }
+
+int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
+                                   int fnum,
+                                   const char **fname,
+                                   const char **ftype,
+                                   int with_stats,
+                                   bst_ulong *len,
+                                   const char ***out_models) {
+  API_BEGIN();
+  utils::FeatMap featmap;
+  for (int i = 0; i < fnum; ++i) {
+    featmap.PushBack(i, fname[i], ftype[i]);
+  }
+  *out_models = static_cast<Booster*>(handle)->GetModelDump(
+      featmap, with_stats != 0, len);
+  API_END();
+}
diff --git a/wrapper/xgboost_wrapper.h b/wrapper/xgboost_wrapper.h
index 0d688b236427..8d0e78a913a7 100644
--- a/wrapper/xgboost_wrapper.h
+++ b/wrapper/xgboost_wrapper.h
@@ -331,4 +331,24 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
                                int with_stats,
                                bst_ulong *out_len,
                                const char ***out_dump_array);
+
+/*!
+ * \brief dump model, return array of strings representing model dump
+ * \param handle handle
+ * \param fnum number of features
+ * \param fname names of features
+ * \param ftype types of features
+ * \param with_stats whether to dump with statistics
+ * \param len length of output array
+ * \param out_models pointer to hold the string dump of each model
+ * \return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterDumpModelWithFeatures(BoosterHandle handle,
+                                           int fnum,
+                                           const char **fname,
+                                           const char **ftype,
+                                           int with_stats,
+                                           bst_ulong *len,
+                                           const char ***out_models);
+
 #endif  // XGBOOST_WRAPPER_H_
From 6063d243eb4efac77751a910362a2f143db3e141 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Tue, 15 Sep 2015 01:52:41 +0900
Subject: [PATCH 35/53] Mac build fix

---
 scripts/travis_osx_install.sh | 12 --------
 scripts/travis_script.sh      | 54 ++++++++++++++++++++++-------------
 tests/python/test_basic.py    | 35 +++++++++++++----------
 3 files changed, 54 insertions(+), 47 deletions(-)

diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh
index adc620a52922..8c449c843688 100755
--- a/scripts/travis_osx_install.sh
+++ b/scripts/travis_osx_install.sh
@@ -5,15 +5,3 @@ if [ ${TRAVIS_OS_NAME} != "osx" ]; then
 fi
 
 brew update
-
-if [ ${TASK} == "python-package" ]; then
-    brew install python git graphviz
-    easy_install pip
-    pip install numpy scipy matplotlib nose
-fi
-
-if [ ${TASK} == "python-package3" ]; then
-    brew install python3 git graphviz
-    sudo pip3 install --upgrade setuptools
-    pip3 install numpy scipy matplotlib nose graphviz
-fi
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
index c5708b0c8b9e..f633f9d7b81e 100755
--- a/scripts/travis_script.sh
+++ b/scripts/travis_script.sh
@@ -33,30 +33,44 @@ if [ ${TASK} == "R-package" ]; then
     scripts/travis_R_script.sh || exit -1
 fi
 
-if [ ${TASK} == "python-package" ]; then
-    sudo apt-get install graphviz
-    sudo apt-get install python-numpy python-scipy python-matplotlib python-nose
-    sudo python -m pip install graphviz
-    make all CXX=${CXX} || exit -1
-    nosetests tests/python || exit -1
-fi
-
-if [ ${TASK} == "python-package3" ]; then
-    sudo apt-get install graphviz
-    # python3-matplotlib is unavailale on Ubuntu 12.04
-    sudo apt-get install python3-dev
-    sudo apt-get install python3-numpy python3-scipy
python3-nose python3-setuptools +if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then - make all CXX=${CXX} || exit -1 + if [ ${TRAVIS_OS_NAME} == "osx" ]; then + brew install graphviz + if [ ${TASK} == "python-package3" ]; then + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh + else + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh + fi + else + sudo apt-get install graphviz + if [ ${TASK} == "python-package3" ]; then + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + else + wget -O conda.sh https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh + fi + fi + bash conda.sh -b -p $HOME/miniconda + export PATH="$HOME/miniconda/bin:$PATH" + hash -r + conda config --set always_yes yes --set changeps1 no + conda update -q conda + # Useful for debugging any issues with conda + conda info -a - if [ ${TRAVIS_OS_NAME} != "osx" ]; then - sudo easy_install3 pip - sudo easy_install3 -U distribute - sudo pip install graphviz matplotlib - nosetests3 tests/python || exit -1 + if [ ${TASK} == "python-package3" ]; then + conda create -n myenv python=3.4 else - nosetests tests/python || exit -1 + conda create -n myenv python=2.7 fi + source activate myenv + conda install numpy scipy matplotlib nose + python -m pip install graphviz + + make all CXX=${CXX} || exit -1 + + python -m nose tests/python || exit -1 + python --version fi # only test java under linux for now diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 70de2626c753..bb6654f51d74 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import numpy as np import xgboost as xgb @@ -33,21 +34,25 @@ def test_feature_names(): data = np.random.randn(100, 5) target = np.array([0, 1] * 50) - features = ['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'] - dm = xgb.DMatrix(data, label=target, - feature_names=features) - assert dm.feature_names == features - assert dm.num_row() == 100 - assert dm.num_col() == 5 - - params={'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} - - bst = xgb.train(params, dm, num_boost_round=10) - scores = bst.get_fscore() - assert list(sorted(k for k in scores)) == features + cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'], + [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']] + + for features in cases: + dm = xgb.DMatrix(data, label=target, + feature_names=features) + assert dm.feature_names == features + assert dm.num_row() == 100 + assert dm.num_col() == 5 + + params={'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + scores = bst.get_fscore() + assert list(sorted(k for k in scores)) == features + def test_plotting(): bst2 = xgb.Booster(model_file='xgb.model') From 030a4e4e25302fd9e72c31373a1be67fde5e81f2 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 16 Sep 2015 01:23:31 -0700 Subject: [PATCH 36/53] spelling and grammar changes --- doc/faq.md | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/doc/faq.md b/doc/faq.md index 32dc5a1b33f8..d5be6f85ed15 100644 --- a/doc/faq.md +++ b/doc/faq.md @@ -1,6 +1,6 @@ -Frequent Asked Questions +Frequently Asked Questions ======================== -This document contains the frequent asked question to xgboost. 
+This document contains frequently asked questions about xgboost.
 
 How to tune parameters
 ----------------------
@@ -13,7 +13,7 @@ See [Introduction to Boosted Trees](model.md)
 
 I have a big dataset
 --------------------
-XGBoost is designed to be memory efficient. Usually it could handle problems as long as the data fit into your memory
+XGBoost is designed to be memory efficient. Usually it can handle problems as long as the data fit into your memory
 (This usually means millions of instances).
 If you are running out of memory, checkout [external memory version](external_memory.md) or
 [distributed version](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) of xgboost.
 
@@ -23,30 +23,30 @@ Running xgboost on Platform X (Hadoop/Yarn, Mesos)
 --------------------------------------------------
 The distributed version of XGBoost is designed to be portable to various environment.
 Distributed XGBoost can be ported to any platform that supports [rabit](https://github.com/dmlc/rabit).
-You can directly run xgboost on Yarn. In theory Mesos and other resource allocation engine can be easily supported as well.
+You can directly run xgboost on Yarn. In theory Mesos and other resource allocation engines can be easily supported as well.
 
 Why not implement distributed xgboost on top of X (Spark, Hadoop)
 -----------------------------------------------------------------
 The first fact we need to know is going distributed does not necessarily solve all the problems.
-Instead, it creates more problems such as more communication over head and fault tolerance.
-The ultimate question will still come back into how to push the limit of each computation node
+Instead, it creates more problems such as more communication overhead and fault tolerance.
+The ultimate question will still come back to how to push the limit of each computation node
 and use less resources to complete the task (thus with less communication and chance of failure).
 To achieve these, we decide to reuse the optimizations in the single node xgboost and build distributed version on top of it.
-The demand of communication in machine learning is rather simple, in a sense that we can depend on a limited set of API (in our case rabit).
-Such design allows us to reuse most of the code, and being portable to major platforms such as Hadoop/Yarn, MPI, SGE.
-Most importantly, pushs the limit of the computation resources we can use.
+The demand of communication in machine learning is rather simple, in the sense that we can depend on a limited set of API (in our case rabit).
+Such design allows us to reuse most of the code, while being portable to major platforms such as Hadoop/Yarn, MPI, SGE.
+Most importantly, it pushes the limit of the computation resources we can use.
 
 How can I port the model to my own system
 -----------------------------------------
-The model and data format of XGBoost is exchangable.
-Which means the model trained by one langauge can be loaded in another.
+The model and data format of XGBoost is exchangeable,
+which means the model trained by one language can be loaded in another.
 This means you can train the model using R, while running prediction using
-Java or C++, which are more common in production system.
-You can also train the model using distributed version,
-and load them in from python to do some interactive analysis.
+Java or C++, which are more common in production systems.
+You can also train the model using a distributed version,
+and load it from Python to do some interactive analysis.
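To make the cross-language round trip concrete, here is a minimal sketch of the Python side of such a workflow, built only from calls that appear elsewhere in this patch series (`save_model`, `Booster(model_file=...)`, `predict`); the file paths are purely illustrative:

```python
import xgboost as xgb

# train in one environment (this could equally be R or a distributed job)
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train')
bst = xgb.train({'objective': 'binary:logistic'}, dtrain, num_boost_round=2)

# the saved binary model is format-compatible across language bindings
bst.save_model('xgb.model')

# later, possibly in another process or language binding,
# load the same model file and run prediction
bst2 = xgb.Booster(model_file='xgb.model')
preds = bst2.predict(xgb.DMatrix('demo/data/agaricus.txt.test'))
```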
Do you support LambdaMART
From 7f3bc03990d48d938d637011b82945c09dcb1674 Mon Sep 17 00:00:00 2001
From: Alex Miller
Date: Wed, 16 Sep 2015 01:33:28 -0700
Subject: [PATCH 37/53] spelling and grammar

---
 doc/model.md | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/doc/model.md b/doc/model.md
index f4373b3fc537..d9ecd2620f7c 100644
--- a/doc/model.md
+++ b/doc/model.md
@@ -2,29 +2,29 @@ Introduction to Boosted Trees
 =============================
 XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" is proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, by Friedman, and XGBoost is based on this original model. This is a tutorial on boosted trees; most of the content is based on this [slide](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost.
-The GBM(boosted trees) has been around for really a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way of supervised learning. We think this explaination is cleaner, more formal, and motivates the variant used in xgboost.
+The GBM(boosted trees) has been around for a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way using the elements of supervised learning. We think this explanation is cleaner, more formal, and motivates the variant used in xgboost.
 
 Elements of Supervised Learning
 -------------------------------
 XGBoost is used for supervised learning problems, where we use the training data ``$ x_i $`` to predict a target variable ``$ y_i $``.
-Before we get dived into trees, let us start from reviwing the basic elements in supervised learning.
+Before we dive into trees, let us start by reviewing the basic elements in supervised learning.
 
 ### Model and Parameters
 The ***model*** in supervised learning usually refers to the mathematical structure of how to make the prediction ``$ y_i $`` given ``$ x_i $``.
 For example, a common model is *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features.
 The prediction value can have different interpretations, depending on the task.
-For example, it can be logistic transformed to get the probability of postitive class in logistic regression, it can also be used as ranking score when we want to rank the outputs.
+For example, it can be logistic transformed to get the probability of the positive class in logistic regression, and it can also be used as a ranking score when we want to rank the outputs.
 
 The ***parameters*** are the undetermined part that we need to learn from data. In the linear regression problem, the parameters are the coefficients ``$ w $``.
 Usually we will use ``$ \Theta $`` to denote the parameters.
 
-### Object Function : Training Loss + Regularization
+### Objective Function : Training Loss + Regularization
 
 Based on different understanding or assumption of ``$ y_i $``, we can have different problems as regression, classification, ordering, etc.
 We need to find a way to find the best parameters given the training data. In order to do so, we need to define a so called ***objective function***,
 to measure the performance of the model under certain set of parameters.
 
-A very important about objective functions, is they ***must always*** contains two parts: training loss and regularization.
+A very important fact about objective functions is that they ***must always*** contain two parts: training loss and regularization.
 
 ```math
 Obj(\Theta) = L(\Theta) + \Omega(\Theta)
 ```
 
@@ -42,8 +42,8 @@ Another commonly used loss function is logistic loss for logistic regression
 L(\theta) = \sum_i[ y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})]
 ```
 
-The ***regularization term*** is usually people forget to add. The regularization term controls the complexity of the model, this helps us to avoid overfitting.
-This sounds a bit abstract, let us consider the following problem in the following picture. You are asked to *fit* visually a step function given the input data points
+The ***regularization term*** is what people usually forget to add. The regularization term controls the complexity of the model, which helps us to avoid overfitting.
+This sounds a bit abstract, so let us consider the problem in the following picture. You are asked to *fit* visually a step function given the input data points
 on the upper left corner of the image. Which solution among the three do you think is the best fit?
 
 ![Step function](img/step_fit.png)
 
@@ -55,12 +55,12 @@ The tradeoff between the two is also referred as bias-variance tradeoff in machi
 ### Why introduce the general principle
 The elements introduced above form the basic elements of supervised learning, and they are naturally the building blocks of machine learning toolkits. For example, you should be able to answer what is the difference and common parts between boosted trees and random forest.
-Understanding the process in a formalized way also helps us to understand the objective what we are learning and getting the reason behind the heurestics such as
+Understanding the process in a formalized way also helps us to understand the objective that we are learning and the reason behind heuristics such as
 pruning and smoothing.
 
 Tree Ensemble
 -------------
-Now we have introduce the elements of supervised learning, let us getting started with real trees.
+Now that we have introduced the elements of supervised learning, let us get started with real trees.
 To begin with, let us first learn what is the ***model*** of xgboost: tree ensembles. The tree ensemble model is a set of classification and regression trees (CART). Here's a simple example of a CART
 that classifies whether someone will like computer games.
 
![CART](img/cart.png)
 
 We classify the members of this family into different leaves, and assign them the score on the corresponding leaf.
 A CART is a bit different from decision trees, where the leaf only contains decision values. In CART, a real score
-is associated with each of the leaves, this allows gives us richer interpretations that go beyond classification.
+is associated with each of the leaves, which gives us richer interpretations that go beyond classification.
 This also makes the unified optimization step easier, as we will see in a later part of this tutorial.
 
 Usually, a single tree is not strong enough to be used in practice. What is actually used is the so called
-tree ensemble model, that sumes the prediction of multiple trees together.
+tree ensemble model, which sums the predictions of multiple trees together.
 
![TwoCART](img/twocart.png)
 
 Here is an example of a tree ensemble of two trees. The prediction scores of each individual tree are summed up to get the final score.
-If you look at the example, an important fact is that the two trees tries to *complement* each other.
-Mathematically, we can write our model into the form +If you look at the example, an important fact is that the two trees try to *complement* each other. +Mathematically, we can write our model in the form ```math \hat{y}_i = \sum_{k=1}^K f_k(x_i), f_k \in \mathcal{F} @@ -219,7 +219,7 @@ This formula can be decomposited as 1) the score on the new left leaf 2) the sco We can find an important fact here: if the gain is smaller than ``$\gamma$``, we would better not to add that branch. This is exactly the ***prunning*** techniques in tree based models! By using the principles of supervised learning, we can naturally comes up with the reason these techniques :) -For real valued data, we usually want to search for an optimal split. To efficiently doing so, we place all the instances in a sorted way, like the following picture. +For real valued data, we usually want to search for an optimal split. To efficiently do so, we place all the instances in a sorted way, like the following picture. ![Best split](img/split_find.png) Then a left to right scan is sufficient to calculate the structure score of all possible split solutions, and we can find the best split efficiently. From 0b143e6d221be39d06a5029b5a38268eea026ad4 Mon Sep 17 00:00:00 2001 From: Alex Miller Date: Wed, 16 Sep 2015 01:39:01 -0700 Subject: [PATCH 38/53] spelling changes --- R-package/vignettes/xgboostPresentation.Rmd | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 89d27fb45dc2..45d2e8b8ea27 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -160,7 +160,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth #### xgb.DMatrix -**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later. +**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later. ```{r trainingDmatrix, message=F, warning=F} dtrain <- xgb.DMatrix(data = train$data, label = train$label) @@ -169,7 +169,7 @@ bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround #### Verbose option -**XGBoost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality. +**XGBoost** has several features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality. One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics). @@ -194,7 +194,7 @@ Basic prediction using XGBoost Perform the prediction ---------------------- -The pupose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step. +The purpose of the model we have built is to classify new data. As explained before, we will use the `test` dataset for this step. ```{r predicting, message=F, warning=F} pred <- predict(bst, test$data) @@ -267,7 +267,7 @@ Measure learning progress with xgb.train Both `xgboost` (simple) and `xgb.train` (advanced) functions train models. 
-One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following technics will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible.
+One of the special features of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a point when having too many rounds leads to overfitting. You can see this feature as a cousin of the cross-validation method. The following techniques will help you to avoid overfitting and to optimize the learning time by stopping it as soon as possible.
 
 One way to measure progress in the learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
 
@@ -285,7 +285,7 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
 
-If with your own dataset you have not such results, you should think about how you did to divide your dataset in training and test. May be there is something to fix. Again, `caret` package may [help](http://topepo.github.io/caret/splitting.html).
+If you do not have such results with your own dataset, you should think about how you divided your dataset into training and test. Maybe there is something to fix. Again, the `caret` package may [help](http://topepo.github.io/caret/splitting.html).
 
 For a better understanding of the learning progression, you may want to have some specific metric or even use multiple evaluation metrics.
 
@@ -306,7 +306,7 @@ bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nr
 
 In this specific case, *linear boosting* gets slightly better performance metrics than the decision tree based algorithm.
 
-In simple cases, it will happem because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
+In simple cases, it will happen because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better at catching a non-linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
 
 Manipulating xgb.DMatrix
 ------------------------
@@ -368,7 +368,7 @@ xgb.plot.tree(model = bst)
 
 Save and load models
 --------------------
 
-May be your dataset is big, and it takes time to train a model on it? May be you are not a big fan of loosing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
+Maybe your dataset is big, and it takes time to train a model on it? Maybe you are not a big fan of losing time redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
Hopefully for you, **XGBoost** implements such functions. @@ -379,7 +379,7 @@ xgb.save(bst, "xgboost.model") > `xgb.save` function should return `r TRUE` if everything goes well and crashes otherwise. -An interesting test to see how identic is our saved model with the original one would be to compare the two predictions. +An interesting test to see how identical our saved model is to the original one would be to compare the two predictions. ```{r loadModel, message=F, warning=F} # load binary model to R From db0c9e1c2d38d0aff7f8e5012dd265f1b9a91b96 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 16 Sep 2015 21:53:51 +0900 Subject: [PATCH 39/53] BUG: incorrect model_file results in segfault --- python-package/xgboost/core.py | 5 ++++- tests/python/test_basic.py | 11 +++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index bcb68580e325..2718ca7045d4 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -743,7 +743,10 @@ def load_model(self, fname): Input file name or memory buffer(see also save_raw) """ if isinstance(fname, str): # assume file name - _LIB.XGBoosterLoadModel(self.handle, c_str(fname)) + if os.path.exists(fname): + _LIB.XGBoosterLoadModel(self.handle, c_str(fname)) + else: + raise ValueError("No such file: {0}") else: buf = fname length = ctypes.c_ulong(len(buf)) diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index bb6654f51d74..111d389a0d55 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -1,9 +1,20 @@ # -*- coding: utf-8 -*- import numpy as np import xgboost as xgb +import unittest + dpath = 'demo/data/' + +class TestBasic(unittest.TestCase): + + def test_load_file_invalid(self): + + self.assertRaises(ValueError, xgb.Booster, + model_file='incorrect_path') + + def test_basic(): dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') From bb6b7ded5557334cb4f457afc9a8701d9d12c12a Mon Sep 17 00:00:00 2001 From: sinhrks Date: Wed, 16 Sep 2015 20:37:19 +0900 Subject: [PATCH 40/53] Cleanup str roundtrip using ctypes --- python-package/xgboost/core.py | 77 ++++++++---- tests/python/test_basic.py | 209 ++++++++++++++++++--------------- 2 files changed, 170 insertions(+), 116 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 2718ca7045d4..74170ea84c50 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -21,16 +21,66 @@ class XGBoostError(Exception): """Error throwed by xgboost trainer.""" pass +PY3 = (sys.version_info[0] == 3) -if sys.version_info[0] == 3: +if PY3: # pylint: disable=invalid-name, redefined-builtin STRING_TYPES = str, - unicode = str else: # pylint: disable=invalid-name STRING_TYPES = basestring, +def from_pystr_to_cstr(data): + """Convert a list of Python str to C pointer + + Parameters + ---------- + data : list + list of str + """ + + if isinstance(data, list): + pointers = (ctypes.c_char_p * len(data))() + if PY3: + data = [bytes(d, 'utf-8') for d in data] + else: + data = [d.encode('utf-8') if isinstance(d, unicode) else d + for d in data] + pointers[:] = data + return pointers + else: + # copy from above when we actually use it + raise NotImplementedError + + +def from_cstr_to_pystr(data, length): + """Revert C pointer to Python str + + Parameters + ---------- + data : ctypes pointer + pointer to data + length : ctypes pointer + pointer to length of data + """ + if PY3: + res = 
[] + for i in range(length.value): + try: + res.append(str(data[i].decode('ascii'))) + except UnicodeDecodeError: + res.append(str(data[i].decode('utf-8'))) + else: + res = [] + for i in range(length.value): + try: + res.append(str(data[i].decode('ascii'))) + except UnicodeDecodeError: + res.append(unicode(data[i].decode('utf-8'))) + return res + + def find_lib_path(): """Load find the path to xgboost dynamic library files. @@ -787,21 +837,12 @@ def get_dump(self, fmap='', with_stats=False): sarr = ctypes.POINTER(ctypes.c_char_p)() if self.feature_names is not None and fmap == '': flen = int(len(self.feature_names)) - fname = (ctypes.c_char_p * flen)() - ftype = (ctypes.c_char_p * flen)() + + fname = from_pystr_to_cstr(self.feature_names) # supports quantitative type only # {'q': quantitative, 'i': indicator} - if sys.version_info[0] == 3: - features = [bytes(f, 'utf-8') for f in self.feature_names] - types = [bytes('q', 'utf-8')] * flen - else: - features = [f.encode('utf-8') if isinstance(f, unicode) else f - for f in self.feature_names] - types = ['q'] * flen - - fname[:] = features - ftype[:] = types + ftype = from_pystr_to_cstr(['q'] * flen) _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle, flen, fname, @@ -815,13 +856,7 @@ def get_dump(self, fmap='', with_stats=False): int(with_stats), ctypes.byref(length), ctypes.byref(sarr))) - - res = [] - for i in range(length.value): - try: - res.append(str(sarr[i].decode('ascii'))) - except UnicodeDecodeError: - res.append(unicode(sarr[i].decode('utf-8'))) + res = from_cstr_to_pystr(sarr, length) return res def get_fscore(self, fmap=''): diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 111d389a0d55..404d4354f688 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -6,106 +6,125 @@ dpath = 'demo/data/' - class TestBasic(unittest.TestCase): + def test_basic(self): + dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') + dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') + param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + # specify validations set to watch performance + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 2 + bst = xgb.train(param, dtrain, num_round, watchlist) + # this is prediction + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + # error must be smaller than 10% + assert err < 0.1 + + # save dmatrix into binary buffer + dtest.save_binary('dtest.buffer') + # save model + bst.save_model('xgb.model') + # load model and data in + bst2 = xgb.Booster(model_file='xgb.model') + dtest2 = xgb.DMatrix('dtest.buffer') + preds2 = bst2.predict(dtest2) + # assert they are the same + assert np.sum(np.abs(preds2-preds)) == 0 + + def test_dmatrix_init(self): + data = np.random.randn(5, 5) + + # different length + self.assertRaises(ValueError, xgb.DMatrix, data, + feature_names=list('abcdef')) + # contains duplicates + self.assertRaises(ValueError, xgb.DMatrix, data, + feature_names=['a', 'b', 'c', 'd', 'd']) + # contains symbol + self.assertRaises(ValueError, xgb.DMatrix, data, + feature_names=['a', 'b', 'c', 'd', 'e=1']) + + def test_feature_names(self): + data = np.random.randn(100, 5) + target = np.array([0, 1] * 50) + + cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'], + [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']] + + for features in cases: + dm = xgb.DMatrix(data, label=target, + feature_names=features) + assert dm.feature_names 
== features + assert dm.num_row() == 100 + assert dm.num_col() == 5 + + params={'objective': 'multi:softprob', + 'eval_metric': 'mlogloss', + 'eta': 0.3, + 'num_class': 3} + + bst = xgb.train(params, dm, num_boost_round=10) + scores = bst.get_fscore() + assert list(sorted(k for k in scores)) == features + + dummy = np.random.randn(5, 5) + dm = xgb.DMatrix(dummy, feature_names=features) + bst.predict(dm) + + # different feature name must raises error + dm = xgb.DMatrix(dummy, feature_names=list('abcde')) + self.assertRaises(ValueError, bst.predict, dm) + def test_load_file_invalid(self): self.assertRaises(ValueError, xgb.Booster, model_file='incorrect_path') + def test_plotting(self): + bst2 = xgb.Booster(model_file='xgb.model') + # plotting + + import matplotlib + matplotlib.use('Agg') + + from matplotlib.axes import Axes + from graphviz import Digraph + + ax = xgb.plot_importance(bst2) + assert isinstance(ax, Axes) + assert ax.get_title() == 'Feature importance' + assert ax.get_xlabel() == 'F score' + assert ax.get_ylabel() == 'Features' + assert len(ax.patches) == 4 + + ax = xgb.plot_importance(bst2, color='r', + title='t', xlabel='x', ylabel='y') + assert isinstance(ax, Axes) + assert ax.get_title() == 't' + assert ax.get_xlabel() == 'x' + assert ax.get_ylabel() == 'y' + assert len(ax.patches) == 4 + for p in ax.patches: + assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red + + + ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'], + title=None, xlabel=None, ylabel=None) + assert isinstance(ax, Axes) + assert ax.get_title() == '' + assert ax.get_xlabel() == '' + assert ax.get_ylabel() == '' + assert len(ax.patches) == 4 + assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # red + assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0) # red + assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue + assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue + + g = xgb.to_graphviz(bst2, num_trees=0) + assert isinstance(g, Digraph) + ax = xgb.plot_tree(bst2, num_trees=0) + assert isinstance(ax, Axes) -def test_basic(): - dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') - dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') - param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } - # specify validations set to watch performance - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 2 - bst = xgb.train(param, dtrain, num_round, watchlist) - # this is prediction - preds = bst.predict(dtest) - labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - # error must be smaller than 10% - assert err < 0.1 - - # save dmatrix into binary buffer - dtest.save_binary('dtest.buffer') - # save model - bst.save_model('xgb.model') - # load model and data in - bst2 = xgb.Booster(model_file='xgb.model') - dtest2 = xgb.DMatrix('dtest.buffer') - preds2 = bst2.predict(dtest2) - # assert they are the same - assert np.sum(np.abs(preds2-preds)) == 0 - -def test_feature_names(): - data = np.random.randn(100, 5) - target = np.array([0, 1] * 50) - - cases = [['Feature1', 'Feature2', 'Feature3', 'Feature4', 'Feature5'], - [u'要因1', u'要因2', u'要因3', u'要因4', u'要因5']] - - for features in cases: - dm = xgb.DMatrix(data, label=target, - feature_names=features) - assert dm.feature_names == features - assert dm.num_row() == 100 - assert dm.num_col() == 5 - - params={'objective': 'multi:softprob', - 'eval_metric': 'mlogloss', - 'eta': 0.3, - 'num_class': 3} - - bst = xgb.train(params, dm, num_boost_round=10) - 
scores = bst.get_fscore()
-    assert list(sorted(k for k in scores)) == features
-
-
-def test_plotting():
-    bst2 = xgb.Booster(model_file='xgb.model')
-    # plotting
-
-    import matplotlib
-    matplotlib.use('Agg')
-
-    from matplotlib.axes import Axes
-    from graphviz import Digraph
-
-    ax = xgb.plot_importance(bst2)
-    assert isinstance(ax, Axes)
-    assert ax.get_title() == 'Feature importance'
-    assert ax.get_xlabel() == 'F score'
-    assert ax.get_ylabel() == 'Features'
-    assert len(ax.patches) == 4
-
-    ax = xgb.plot_importance(bst2, color='r',
-                             title='t', xlabel='x', ylabel='y')
-    assert isinstance(ax, Axes)
-    assert ax.get_title() == 't'
-    assert ax.get_xlabel() == 'x'
-    assert ax.get_ylabel() == 'y'
-    assert len(ax.patches) == 4
-    for p in ax.patches:
-        assert p.get_facecolor() == (1.0, 0, 0, 1.0)  # red
-
-
-    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],
-                             title=None, xlabel=None, ylabel=None)
-    assert isinstance(ax, Axes)
-    assert ax.get_title() == ''
-    assert ax.get_xlabel() == ''
-    assert ax.get_ylabel() == ''
-    assert len(ax.patches) == 4
-    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0)  # red
-    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0)  # red
-    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
-    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0)  # blue
-
-    g = xgb.to_graphviz(bst2, num_trees=0)
-    assert isinstance(g, Digraph)
-    ax = xgb.plot_tree(bst2, num_trees=0)
-    assert isinstance(ax, Axes)
From 6af98bec160222c475a7e449cf23486de7ae67ed Mon Sep 17 00:00:00 2001
From: Huayi Zhang
Date: Thu, 17 Sep 2015 12:26:42 +0800
Subject: [PATCH 41/53] Fix python setup: avoid import numpy in setup.py

Currently `pip install xgboost` will raise traceback like this

```
Traceback (most recent call last):
 File "", line 20, in
 File "/tmp/pip-build-IAdqYE/xgboost/setup.py", line 20, in
 import xgboost
 File "./xgboost/__init__.py", line 8, in
 from .core import DMatrix, Booster
 File "./xgboost/core.py", line 12, in
 import numpy as np
ImportError: No module named numpy
```

We should avoid importing numpy in setup.py and let pip install numpy
and scipy automatically. That's what `install_requires` is for.
---
 python-package/setup.py            | 16 ++++++++---
 python-package/xgboost/VERSION     |  1 +
 python-package/xgboost/__init__.py |  6 +++++-
 python-package/xgboost/core.py     | 39 +-------------------------
 python-package/xgboost/libpath.py  | 44 ++++++++++++++++++++++++++++++
 5 files changed, 63 insertions(+), 43 deletions(-)
 create mode 100644 python-package/xgboost/VERSION
 create mode 100644 python-package/xgboost/libpath.py

diff --git a/python-package/setup.py b/python-package/setup.py
index 3f8ad9a1d875..1c5a257b6673 100644
--- a/python-package/setup.py
+++ b/python-package/setup.py
@@ -17,9 +17,17 @@
 output = build_sh.communicate()
 print(output)
 
-import xgboost
-LIB_PATH = xgboost.core.find_lib_path()
+CURRENT_DIR = os.path.dirname(__file__)
+
+# We can not import `xgboost.libpath` in setup.py directly since xgboost/__init__.py
+# import `xgboost.core` and finally will import `numpy` and `scipy` which are setup
+# `install_requires`. That's why using `execfile` here.
+libpath_py = os.path.join(CURRENT_DIR, 'xgboost/libpath.py') +libpath = {'__file__': libpath_py} +execfile(libpath_py, libpath, libpath) + +LIB_PATH = libpath['find_lib_path']() #print LIB_PATH #to deploy to pip, please use @@ -27,9 +35,9 @@ #python setup.py register sdist upload #and be sure to test it firstly using "python setup.py register sdist upload -r pypitest" setup(name='xgboost', - version=xgboost.__version__, + version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), #version='0.4a13', - description=xgboost.__doc__, + description=open(os.path.join(CURRENT_DIR, 'README.md')).read(), install_requires=[ 'numpy', 'scipy', diff --git a/python-package/xgboost/VERSION b/python-package/xgboost/VERSION new file mode 100644 index 000000000000..e6adf3fc7bb7 --- /dev/null +++ b/python-package/xgboost/VERSION @@ -0,0 +1 @@ +0.4 \ No newline at end of file diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index b251b450119b..06892851fe8b 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -5,12 +5,16 @@ """ from __future__ import absolute_import + +import os + from .core import DMatrix, Booster from .training import train, cv from .sklearn import XGBModel, XGBClassifier, XGBRegressor from .plotting import plot_importance, plot_tree, to_graphviz -__version__ = '0.4' +VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION') +__version__ = open(VERSION_FILE).read().strip() __all__ = ['DMatrix', 'Booster', 'train', 'cv', diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 74170ea84c50..acd954f3dc70 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -6,16 +6,13 @@ import os import sys import ctypes -import platform import collections import numpy as np import scipy.sparse +from .libpath import find_lib_path -class XGBoostLibraryNotFound(Exception): - """Error throwed by when xgboost is not found""" - pass class XGBoostError(Exception): """Error throwed by xgboost trainer.""" @@ -81,40 +78,6 @@ def from_cstr_to_pystr(data, length): return res -def find_lib_path(): - """Load find the path to xgboost dynamic library files. 
-
-    Returns
-    -------
-    lib_path: list(string)
-       List of all found library path to xgboost
-    """
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    #make pythonpack hack: copy this directory one level upper for setup.py
-    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')
-                , os.path.join(curr_path, './wrapper/')]
-    if os.name == 'nt':
-        if platform.architecture()[0] == '64bit':
-            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
-            #hack for pip installation when copy all parent source directory here
-            dll_path.append(os.path.join(curr_path, './windows/x64/Release/'))
-        else:
-            dll_path.append(os.path.join(curr_path, '../../windows/Release/'))
-            #hack for pip installation when copy all parent source directory here
-            dll_path.append(os.path.join(curr_path, './windows/Release/'))
-    if os.name == 'nt':
-        dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
-    else:
-        dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
-    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
-        raise XGBoostLibraryNotFound(
-            'Cannot find XGBoost Libarary in the candicate path, ' +
-            'did you run build.sh in root path?\n'
-            'List of candidates:\n' + ('\n'.join(dll_path)))
-    return lib_path
-
-
 def _load_lib():
     """Load xgboost Library."""
     lib_path = find_lib_path()
diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py
new file mode 100644
index 000000000000..293719f01bc4
--- /dev/null
+++ b/python-package/xgboost/libpath.py
@@ -0,0 +1,44 @@
+# coding: utf-8
+"""Find the path to xgboost dynamic library files."""
+
+import os
+import platform
+
+
+class XGBoostLibraryNotFound(Exception):
+    """Error thrown when the xgboost library is not found"""
+    pass
+
+
+def find_lib_path():
+    """Find the path to xgboost dynamic library files.
+
+    Returns
+    -------
+    lib_path: list(string)
+       List of all found library path to xgboost
+    """
+    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+    # make pythonpack hack: copy this directory one level upper for setup.py
+    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/'),
+                os.path.join(curr_path, './wrapper/')]
+    if os.name == 'nt':
+        if platform.architecture()[0] == '64bit':
+            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
+            # hack for pip installation when copy all parent source directory here
+            dll_path.append(os.path.join(curr_path, './windows/x64/Release/'))
+        else:
+            dll_path.append(os.path.join(curr_path, '../../windows/Release/'))
+            # hack for pip installation when copy all parent source directory here
+            dll_path.append(os.path.join(curr_path, './windows/Release/'))
+    if os.name == 'nt':
+        dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
+    else:
+        dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
+    lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
+    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
+        raise XGBoostLibraryNotFound(
+            'Cannot find XGBoost Library in the candidate path, ' +
+            'did you run build.sh in root path?\n'
+            'List of candidates:\n' + ('\n'.join(dll_path)))
+    return lib_path
From f7d434aec2bc8ef302f24e5912900968d8ceb5c0 Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Wed, 16 Sep 2015 20:47:37 +0900
Subject: [PATCH 42/53] Fix numpy array check logic

---
 python-package/xgboost/core.py | 10 +++++++---
 tests/python/test_basic.py    | 26 +++++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
index bcb68580e325..b528e2800a2b 100644
--- a/python-package/xgboost/core.py
+++ b/python-package/xgboost/core.py
@@ -220,7 +220,7 @@ def __init__(self, data, label=None, missing=0.0,
             self._init_from_csr(data)
         elif isinstance(data, scipy.sparse.csc_matrix):
             self._init_from_csc(data)
-        elif isinstance(data, np.ndarray) and len(data.shape) == 2:
+        elif isinstance(data, np.ndarray):
            self._init_from_npy2d(data, missing)
         else:
             try:
@@ -278,6 +278,8 @@ def _init_from_npy2d(self, mat, missing):
         """
         Initialize data from a 2-D numpy matrix.
""" + if len(mat.shape) != 2: + raise ValueError('Input numpy.ndarray must be 2 dimensional') data = np.array(mat.reshape(mat.size), dtype=np.float32) self.handle = ctypes.c_void_p() _check_call(_LIB.XGDMatrixCreateFromMat(data.ctypes.data_as(ctypes.POINTER(ctypes.c_float)), @@ -792,11 +794,11 @@ def load_model(self, fname): fname : string or a memory buffer Input file name or memory buffer(see also save_raw) """ - if isinstance(fname, str): # assume file name + if isinstance(fname, STRING_TYPES): # assume file name if os.path.exists(fname): _LIB.XGBoosterLoadModel(self.handle, c_str(fname)) else: - raise ValueError("No such file: {0}") + raise ValueError("No such file: {0}".format(fname)) else: buf = fname length = ctypes.c_ulong(len(buf)) @@ -851,6 +853,8 @@ def get_dump(self, fmap='', with_stats=False): ctypes.byref(length), ctypes.byref(sarr))) else: + if fmap != '' and not os.path.exists(fmap): + raise ValueError("No such file: {0}".format(fmap)) _check_call(_LIB.XGBoosterDumpModel(self.handle, c_str(fmap), int(with_stats), diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 404d4354f688..e7c0629ca45c 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -83,6 +83,31 @@ def test_load_file_invalid(self): self.assertRaises(ValueError, xgb.Booster, model_file='incorrect_path') + self.assertRaises(ValueError, xgb.Booster, + model_file=u'不正なパス') + + def test_dmatrix_numpy_init(self): + data = np.random.randn(5, 5) + dm = xgb.DMatrix(data) + assert dm.num_row() == 5 + assert dm.num_col() == 5 + + data = np.matrix([[1, 2], [3, 4]]) + dm = xgb.DMatrix(data) + assert dm.num_row() == 2 + assert dm.num_col() == 2 + + # 0d array + self.assertRaises(ValueError, xgb.DMatrix, np.array(1)) + # 1d array + self.assertRaises(ValueError, xgb.DMatrix, np.array([1, 2, 3])) + # 3d array + data = np.random.randn(5, 5, 5) + self.assertRaises(ValueError, xgb.DMatrix, data) + # object dtype + data = np.array([['a', 'b'], ['c', 'd']]) + self.assertRaises(ValueError, xgb.DMatrix, data) + def test_plotting(self): bst2 = xgb.Booster(model_file='xgb.model') # plotting @@ -127,4 +152,3 @@ def test_plotting(self): assert isinstance(g, Digraph) ax = xgb.plot_tree(bst2, num_trees=0) assert isinstance(ax, Axes) - From c49c6565e5e67d6f147c9f2b0906306a1a501571 Mon Sep 17 00:00:00 2001 From: Huayi Zhang Date: Fri, 18 Sep 2015 10:35:41 +0800 Subject: [PATCH 43/53] Add contributor --- CONTRIBUTORS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ab9c980c89a6..32a6745f01f4 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -49,3 +49,4 @@ List of Contributors - Masaaki is the initial creator of xgboost python plotting module. * [Hongliang Liu](https://github.com/phunterlau) - Hongliang is the maintainer of xgboost python PyPI package for pip installation. +* [Huayi Zhang](https://github.com/irachex) From 0bca4c8c3b7bb6bb1c3e00a11cf29566d3f8d013 Mon Sep 17 00:00:00 2001 From: Jamie Hall Date: Sat, 19 Sep 2015 10:46:57 +1000 Subject: [PATCH 44/53] Restore Python3 compatibility --- python-package/setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python-package/setup.py b/python-package/setup.py index 1c5a257b6673..443b2b57193b 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -22,10 +22,10 @@ # We can not import `xgboost.libpath` in setup.py directly since xgboost/__init__.py # import `xgboost.core` and finally will import `numpy` and `scipy` which are setup -# `install_requires`. 
That's why using `execfile` here. +# `install_requires`. That's why we're using `exec` here. libpath_py = os.path.join(CURRENT_DIR, 'xgboost/libpath.py') libpath = {'__file__': libpath_py} -execfile(libpath_py, libpath, libpath) +exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath) LIB_PATH = libpath['find_lib_path']() #print LIB_PATH From e558d45208b6dcbbcba115b5ff0e0072f32fc179 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 18 Sep 2015 18:45:18 -0700 Subject: [PATCH 45/53] Update .travis.yml --- .travis.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.travis.yml b/.travis.yml index 5e10c3360e37..c7049be94f36 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,6 +30,8 @@ addons: - wget - libcurl4-openssl-dev - unzip + - python-numpy + - python-scipy before_install: - scripts/travis_osx_install.sh From f28459497dbfa0535323cce52a4ea4dc2daa2bf9 Mon Sep 17 00:00:00 2001 From: Tianqi Chen Date: Fri, 18 Sep 2015 20:22:54 -0700 Subject: [PATCH 46/53] fix pylint in setup --- python-package/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/setup.py b/python-package/setup.py index 443b2b57193b..c9dfa415ccbe 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -1,4 +1,4 @@ -# pylint: disable=invalid-name +# pylint: disable=invalid-name, exec-used """Setup xgboost package.""" from __future__ import absolute_import import sys From 6c3e4d7d0d639238b5da1ab97e3bef782000411e Mon Sep 17 00:00:00 2001 From: Jamie Hall Date: Mon, 21 Sep 2015 08:55:42 +1000 Subject: [PATCH 47/53] Use homebrew gcc if available --- Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index dfafc61cf647..3e893bd58173 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ -export CC = gcc -#build on the fly -export CXX = g++ +export CC = $(if $(shell which gcc-5),gcc-5,gcc) +export CXX = $(if $(shell which g++-5),g++-5,gcc) + export MPICXX = mpicxx export LDFLAGS= -pthread -lm export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops From f5920f8cbd8d811c7dd563176746f67af1bfc264 Mon Sep 17 00:00:00 2001 From: Jamie Hall Date: Tue, 22 Sep 2015 07:18:15 +1000 Subject: [PATCH 48/53] Fix makefile typo --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 3e893bd58173..6685b0c6daa7 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ export CC = $(if $(shell which gcc-5),gcc-5,gcc) -export CXX = $(if $(shell which g++-5),g++-5,gcc) +export CXX = $(if $(shell which g++-5),g++-5,g++) export MPICXX = mpicxx export LDFLAGS= -pthread -lm From db692a30e5c0930d48689a8f88be06f64aa698a9 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 17 Sep 2015 22:46:12 +0900 Subject: [PATCH 49/53] Add feature_types --- python-package/xgboost/core.py | 128 +++++++++++++++++++++++------ python-package/xgboost/plotting.py | 27 ++++-- tests/python/test_basic.py | 17 ++++ 3 files changed, 137 insertions(+), 35 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 77a73d95b561..1e14fac7b177 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -147,9 +147,11 @@ class DMatrix(object): """ feature_names = None # for previous version's pickle + feature_types = None def __init__(self, data, label=None, missing=0.0, - weight=None, silent=False, feature_names=None): + weight=None, silent=False, + feature_names=None, feature_types=None): """ Data matrix used in XGBoost. 
@@ -169,6 +171,8 @@ def __init__(self, data, label=None, missing=0.0,
         Whether print messages during construction
     feature_names : list, optional
         Labels for features.
+    feature_types : list, optional
+        Types for features.
     """
     # force into void_p, mac need to pass things in as void_p
     if data is None:
@@ -196,20 +200,8 @@ def __init__(self, data, label=None, missing=0.0,
         if weight is not None:
             self.set_weight(weight)
 
-        # validate feature name
-        if not feature_names is None:
-            if not isinstance(feature_names, list):
-                feature_names = list(feature_names)
-            if len(feature_names) != len(set(feature_names)):
-                raise ValueError('feature_names must be unique')
-            if len(feature_names) != self.num_col():
-                msg = 'feature_names must have the same length as data'
-                raise ValueError(msg)
-            # prohibit to use symbols may affect to parse. e.g. ``[]=.``
-            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
-                       for f in feature_names):
-                raise ValueError('all feature_names must be alphanumerics')
-            self.feature_names = feature_names
+        self.set_feature_names(feature_names)
+        self.set_feature_types(feature_types)
 
     def _init_from_csr(self, csr):
         """
@@ -389,6 +381,66 @@ def set_group(self, group):
                                            c_array(ctypes.c_uint, group),
                                            len(group)))
 
+    def set_feature_names(self, feature_names):
+        """Set feature names (column labels).
+
+        Parameters
+        ----------
+        feature_names : list or None
+            Labels for features. None will reset existing feature names
+        """
+        if not feature_names is None:
+            # validate feature name
+            if not isinstance(feature_names, list):
+                feature_names = list(feature_names)
+            if len(feature_names) != len(set(feature_names)):
+                raise ValueError('feature_names must be unique')
+            if len(feature_names) != self.num_col():
+                msg = 'feature_names must have the same length as data'
+                raise ValueError(msg)
+            # prohibit symbols that may affect parsing, e.g. ``[]=.``
+            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
+                       for f in feature_names):
+                raise ValueError('all feature_names must be alphanumerics')
+        else:
+            # reset feature_types also
+            self.set_feature_types(None)
+        self.feature_names = feature_names
+
+    def set_feature_types(self, feature_types):
+        """Set feature types (column types).
+
+        This is for displaying the results and unrelated
+        to the learning process.
+
+        Parameters
+        ----------
+        feature_types : list or None
+            Types for features. None will reset existing feature types
+        """
+        if not feature_types is None:
+
+            if self.feature_names is None:
+                msg = 'Unable to set feature types before setting names'
+                raise ValueError(msg)
+
+            if isinstance(feature_types, STRING_TYPES):
+                # single string will be applied to all columns
+                feature_types = [feature_types] * self.num_col()
+
+            if not isinstance(feature_types, list):
+                feature_types = list(feature_types)
+            if len(feature_types) != self.num_col():
+                msg = 'feature_types must have the same length as data'
+                raise ValueError(msg)
+
+            valid = ('q', 'i', 'int', 'float')
+            if not all(isinstance(f, STRING_TYPES) and f in valid
+                       for f in feature_types):
+                raise ValueError('all feature_types must be one of {i, q, int, float}')
+        self.feature_types = feature_types
+
     def get_label(self):
         """Get the label of the DMatrix.
@@ -416,6 +468,24 @@ def get_base_margin(self):
         """
         return self.get_float_info('base_margin')
 
+    def get_feature_names(self):
+        """Get feature names (column labels).
+ + Returns + ------- + feature_names : list or None + """ + return self.feature_names + + def get_feature_types(self): + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ + return self.feature_types + def num_row(self): """Get the number of rows in the DMatrix. @@ -487,7 +557,8 @@ def __init__(self, params=None, cache=(), model_file=None): for d in cache: if not isinstance(d, DMatrix): raise TypeError('invalid cache item: {}'.format(type(d).__name__)) - self._validate_feature_names(d) + self._validate_features(d) + dmats = c_array(ctypes.c_void_p, [d.handle for d in cache]) self.handle = ctypes.c_void_p() _check_call(_LIB.XGBoosterCreate(dmats, len(cache), ctypes.byref(self.handle))) @@ -572,7 +643,7 @@ def update(self, dtrain, iteration, fobj=None): """ if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - self._validate_feature_names(dtrain) + self._validate_features(dtrain) if fobj is None: _check_call(_LIB.XGBoosterUpdateOneIter(self.handle, iteration, dtrain.handle)) @@ -598,7 +669,7 @@ def boost(self, dtrain, grad, hess): raise ValueError('grad / hess length mismatch: {} / {}'.format(len(grad), len(hess))) if not isinstance(dtrain, DMatrix): raise TypeError('invalid training matrix: {}'.format(type(dtrain).__name__)) - self._validate_feature_names(dtrain) + self._validate_features(dtrain) _check_call(_LIB.XGBoosterBoostOneIter(self.handle, dtrain.handle, c_array(ctypes.c_float, grad), @@ -629,7 +700,7 @@ def eval_set(self, evals, iteration=0, feval=None): raise TypeError('expected DMatrix, got {}'.format(type(d[0]).__name__)) if not isinstance(d[1], STRING_TYPES): raise TypeError('expected string, got {}'.format(type(d[1]).__name__)) - self._validate_feature_names(d[0]) + self._validate_features(d[0]) dmats = c_array(ctypes.c_void_p, [d[0].handle for d in evals]) evnames = c_array(ctypes.c_char_p, [c_str(d[1]) for d in evals]) @@ -664,7 +735,7 @@ def eval(self, data, name='eval', iteration=0): result: str Evaluation result string. """ - self._validate_feature_names(data) + self._validate_features(data) return self.eval_set([(data, name)], iteration) def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): @@ -703,7 +774,7 @@ def predict(self, data, output_margin=False, ntree_limit=0, pred_leaf=False): if pred_leaf: option_mask |= 0x02 - self._validate_feature_names(data) + self._validate_features(data) length = ctypes.c_ulong() preds = ctypes.POINTER(ctypes.c_float)() @@ -805,9 +876,12 @@ def get_dump(self, fmap='', with_stats=False): fname = from_pystr_to_cstr(self.feature_names) - # supports quantitative type only - # {'q': quantitative, 'i': indicator} - ftype = from_pystr_to_cstr(['q'] * flen) + if self.feature_types is None: + # use quantitative as default + # {'q': quantitative, 'i': indicator} + ftype = from_pystr_to_cstr(['q'] * flen) + else: + ftype = from_pystr_to_cstr(self.feature_types) _check_call(_LIB.XGBoosterDumpModelWithFeatures(self.handle, flen, fname, @@ -849,12 +923,14 @@ def get_fscore(self, fmap=''): fmap[fid] += 1 return fmap - def _validate_feature_names(self, data): + def _validate_features(self, data): """ - Validate Booster and data's feature_names are identical + Validate Booster and data's feature_names are identical. 
+ Set feature_names and feature_types from DMatrix """ if self.feature_names is None: self.feature_names = data.feature_names + self.feature_types = data.feature_types else: # Booster can't accept data with different feature names if self.feature_names != data.feature_names: diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 9c9b2a97d189..50a844a1e6a1 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -92,7 +92,7 @@ def plot_importance(booster, ax=None, height=0.2, _NODEPAT = re.compile(r'(\d+):\[(.+)\]') _LEAFPAT = re.compile(r'(\d+):(leaf=.+)') _EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)') - +_EDGEPAT2 = re.compile(r'yes=(\d+),no=(\d+)') def _parse_node(graph, text): """parse dumped node""" @@ -111,15 +111,24 @@ def _parse_node(graph, text): def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'): """parse dumped edge""" - match = _EDGEPAT.match(text) + try: + match = _EDGEPAT.match(text) + if match is not None: + yes, no, missing = match.groups() + if yes == missing: + graph.edge(node, yes, label='yes, missing', color=yes_color) + graph.edge(node, no, label='no', color=no_color) + else: + graph.edge(node, yes, label='yes', color=yes_color) + graph.edge(node, no, label='no, missing', color=no_color) + return + except ValueError: + pass + match = _EDGEPAT2.match(text) if match is not None: - yes, no, missing = match.groups() - if yes == missing: - graph.edge(node, yes, label='yes, missing', color=yes_color) - graph.edge(node, no, label='no', color=no_color) - else: - graph.edge(node, yes, label='yes', color=yes_color) - graph.edge(node, no, label='no, missing', color=no_color) + yes, no = match.groups() + graph.edge(node, yes, label='yes', color=yes_color) + graph.edge(node, no, label='no', color=no_color) return raise ValueError('Unable to parse edge: {0}'.format(text)) diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index e7c0629ca45c..7dc90579459d 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -47,6 +47,23 @@ def test_dmatrix_init(self): self.assertRaises(ValueError, xgb.DMatrix, data, feature_names=['a', 'b', 'c', 'd', 'e=1']) + dm = xgb.DMatrix(data) + dm.set_feature_names(list('abcde')) + assert dm.get_feature_names() == list('abcde') + + dm.set_feature_types('q') + assert dm.get_feature_types() == list('qqqqq') + + dm.set_feature_types(list('qiqiq')) + assert dm.get_feature_types() == list('qiqiq') + + self.assertRaises(ValueError, dm.set_feature_types, list('abcde')) + + # reset + dm.set_feature_names(None) + assert dm.get_feature_names() is None + assert dm.get_feature_types() is None + def test_feature_names(self): data = np.random.randn(100, 5) target = np.array([0, 1] * 50) From f6f3473d17d3ae0acfd614994ebe6194cafca1c3 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Mon, 28 Sep 2015 22:36:39 +0900 Subject: [PATCH 50/53] Change to properties --- python-package/xgboost/core.py | 168 +++++++++++++++++---------------- tests/python/test_basic.py | 22 +++-- 2 files changed, 98 insertions(+), 92 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 1e14fac7b177..aaddc43fb56d 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -146,8 +146,8 @@ class DMatrix(object): You can construct DMatrix from numpy.arrays """ - feature_names = None # for previous version's pickle - feature_types = None + _feature_names = None # for previous version's pickle + 
_feature_types = None def __init__(self, data, label=None, missing=0.0, weight=None, silent=False, @@ -200,8 +200,8 @@ def __init__(self, data, label=None, missing=0.0, if weight is not None: self.set_weight(weight) - self.set_feature_names(feature_names) - self.set_feature_types(feature_types) + self.feature_names = feature_names + self.feature_types = feature_types def _init_from_csr(self, csr): """ @@ -381,66 +381,6 @@ def set_group(self, group): c_array(ctypes.c_uint, group), len(group))) - def set_feature_names(self, feature_names): - """Set feature names (column labels). - - Parameters - ---------- - feature_names : list or None - Labels for features. None will reset existing feature names - """ - if not feature_names is None: - # validate feature name - if not isinstance(feature_names, list): - feature_names = list(feature_names) - if len(feature_names) != len(set(feature_names)): - raise ValueError('feature_names must be unique') - if len(feature_names) != self.num_col(): - msg = 'feature_names must have the same length as data' - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - if not all(isinstance(f, STRING_TYPES) and f.isalnum() - for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') - else: - # reset feature_types also - self.set_feature_types(None) - self.feature_names = feature_names - - def set_feature_types(self, feature_types): - """Set feature types (column types). - - This is for displaying the results and unrelated - to the learning process. - - Parameters - ---------- - feature_types : list or None - Labels for features. None will reset existing feature names - """ - if not feature_types is None: - - if self.feature_names is None: - msg = 'Unable to set feature types before setting names' - raise ValueError(msg) - - if isinstance(feature_types, STRING_TYPES): - # single string will be applied to all columns - feature_types = [feature_types] * self.num_col() - - if not isinstance(feature_types, list): - feature_types = list(feature_types) - if len(feature_types) != self.num_col(): - msg = 'feature_types must have the same length as data' - raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - - valid = ('q', 'i', 'int', 'float') - if not all(isinstance(f, STRING_TYPES) and f in valid - for f in feature_types): - raise ValueError('all feature_names must be {i, q, int, float}') - self.feature_types = feature_types - def get_label(self): """Get the label of the DMatrix. @@ -468,24 +408,6 @@ def get_base_margin(self): """ return self.get_float_info('base_margin') - def get_feature_names(self): - """Get feature names (column labels). - - Returns - ------- - feature_names : list or None - """ - return self.feature_names - - def get_feature_types(self): - """Get feature types (column types). - - Returns - ------- - feature_types : list or None - """ - return self.feature_types - def num_row(self): """Get the number of rows in the DMatrix. @@ -531,6 +453,88 @@ def slice(self, rindex): ctypes.byref(res.handle))) return res + @property + def feature_names(self): + """Get feature names (column labels). + + Returns + ------- + feature_names : list or None + """ + return self._feature_names + + @property + def feature_types(self): + """Get feature types (column types). + + Returns + ------- + feature_types : list or None + """ + return self._feature_types + + @feature_names.setter + def feature_names(self, feature_names): + """Set feature names (column labels). 
+
+        Parameters
+        ----------
+        feature_names : list or None
+            Labels for features. None will reset the existing feature names.
+        """
+        if not feature_names is None:
+            # validate feature name
+            if not isinstance(feature_names, list):
+                feature_names = list(feature_names)
+            if len(feature_names) != len(set(feature_names)):
+                raise ValueError('feature_names must be unique')
+            if len(feature_names) != self.num_col():
+                msg = 'feature_names must have the same length as data'
+                raise ValueError(msg)
+            # prohibit the use of symbols that may break parsing, e.g. ``[]=.``
+            if not all(isinstance(f, STRING_TYPES) and f.isalnum()
+                       for f in feature_names):
+                raise ValueError('all feature_names must be alphanumeric')
+        else:
+            # also reset feature_types
+            self.feature_types = None
+        self._feature_names = feature_names
+
+    @feature_types.setter
+    def feature_types(self, feature_types):
+        """Set feature types (column types).
+
+        These are used for display purposes only and are
+        unrelated to the learning process.
+
+        Parameters
+        ----------
+        feature_types : list or None
+            Types for features. None will reset the existing feature types.
+        """
+        if not feature_types is None:
+
+            if self.feature_names is None:
+                msg = 'Unable to set feature types before setting names'
+                raise ValueError(msg)
+
+            if isinstance(feature_types, STRING_TYPES):
+                # single string will be applied to all columns
+                feature_types = [feature_types] * self.num_col()
+
+            if not isinstance(feature_types, list):
+                feature_types = list(feature_types)
+            if len(feature_types) != self.num_col():
+                msg = 'feature_types must have the same length as data'
+                raise ValueError(msg)
+            # restrict to the type codes the model dump understands
+
+            valid = ('q', 'i', 'int', 'float')
+            if not all(isinstance(f, STRING_TYPES) and f in valid
+                       for f in feature_types):
+                raise ValueError('all feature_types must be one of {q, i, int, float}')
+        self._feature_types = feature_types
+
 
 class Booster(object):
     """"A Booster of of XGBoost. 
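
The refactor above turns set_feature_names / set_feature_types into properties, so plain attribute assignment (dm.feature_names = [...]) now runs the same validation the old method calls did. A minimal, self-contained sketch of the pattern, assuming nothing beyond the standard library; the Matrix class and its n_cols attribute below are invented for illustration and are not part of XGBoost:

    class Matrix(object):
        # class-level default keeps attribute reads working for old pickles
        _feature_names = None

        def __init__(self, n_cols):
            self.n_cols = n_cols

        @property
        def feature_names(self):
            # reads go through the property, not the raw attribute
            return self._feature_names

        @feature_names.setter
        def feature_names(self, feature_names):
            # assignment validates before storing, as the DMatrix setter above does
            if feature_names is not None:
                feature_names = list(feature_names)
                if len(feature_names) != self.n_cols:
                    raise ValueError('feature_names must have the same length as data')
            self._feature_names = feature_names

    m = Matrix(3)
    m.feature_names = ['a', 'b', 'c']   # runs the setter; validation passes
    assert m.feature_names == ['a', 'b', 'c']
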
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index 7dc90579459d..afbc53e1edd4 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -48,21 +48,23 @@ def test_dmatrix_init(self): feature_names=['a', 'b', 'c', 'd', 'e=1']) dm = xgb.DMatrix(data) - dm.set_feature_names(list('abcde')) - assert dm.get_feature_names() == list('abcde') + dm.feature_names = list('abcde') + assert dm.feature_names == list('abcde') - dm.set_feature_types('q') - assert dm.get_feature_types() == list('qqqqq') + dm.feature_types = 'q' + assert dm.feature_types == list('qqqqq') - dm.set_feature_types(list('qiqiq')) - assert dm.get_feature_types() == list('qiqiq') + dm.feature_types = list('qiqiq') + assert dm.feature_types == list('qiqiq') - self.assertRaises(ValueError, dm.set_feature_types, list('abcde')) + def incorrect_type_set(): + dm.feature_types = list('abcde') + self.assertRaises(ValueError, incorrect_type_set) # reset - dm.set_feature_names(None) - assert dm.get_feature_names() is None - assert dm.get_feature_types() is None + dm.feature_names = None + assert dm.feature_names is None + assert dm.feature_types is None def test_feature_names(self): data = np.random.randn(100, 5) From b943becc618819ac2e866b11168b79be48a8a420 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Thu, 1 Oct 2015 22:39:56 +0900 Subject: [PATCH 51/53] python DMatrix now accepts pandas DataFrame --- python-package/xgboost/core.py | 31 ++++++++++++++++++++++++++++++- scripts/travis_script.sh | 2 +- tests/python/test_basic.py | 21 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 2 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index aaddc43fb56d..8c256782014f 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -138,6 +138,28 @@ def c_array(ctype, values): return (ctype * len(values))(*values) +def _maybe_from_pandas(data, feature_names, feature_types): + """ Extract internal data from pd.DataFrame """ + try: + import pandas as pd + except ImportError: + return data, feature_names, feature_types + + if not isinstance(data, pd.DataFrame): + return data, feature_names, feature_types + + dtypes = data.dtypes + if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes): + raise ValueError('DataFrame.dtypes must be int, float or bool') + + if feature_names is None: + feature_names = data.columns.tolist() + if feature_types is None: + mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'} + feature_types = [mapper[dtype.name] for dtype in dtypes] + data = data.values.astype('float') + return data, feature_names, feature_types + class DMatrix(object): """Data Matrix used in XGBoost. @@ -157,7 +179,7 @@ def __init__(self, data, label=None, missing=0.0, Parameters ---------- - data : string/numpy array/scipy.sparse + data : string/numpy array/scipy.sparse/pd.DataFrame Data source of DMatrix. When data is string type, it represents the path libsvm format txt file, or binary file that xgboost can read from. 
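
To make the conversion above concrete, here is a small runnable sketch of what _maybe_from_pandas extracts from a toy frame, assuming pandas and numpy are installed; the frame itself is invented for illustration:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': [1.5, 2.5], 'c': [True, False]})

    # the same dtype -> feature_type mapping as the helper above
    mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'}
    print(df.columns.tolist())                          # ['a', 'b', 'c']
    print([mapper[dtype.name] for dtype in df.dtypes])  # ['int', 'q', 'i']
    print(df.values.astype('float'))                    # dense float matrix handed to DMatrix
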
@@ -178,6 +200,13 @@ def __init__(self, data, label=None, missing=0.0,
         if data is None:
             self.handle = None
             return
+
+        klass = getattr(getattr(data, '__class__', None), '__name__', None)
+        if klass == 'DataFrame':
+            # check the class name first to avoid importing pandas unnecessarily
+            data, feature_names, feature_types = _maybe_from_pandas(data, feature_names,
+                                                                    feature_types)
+
         if isinstance(data, STRING_TYPES):
             self.handle = ctypes.c_void_p()
             _check_call(_LIB.XGDMatrixCreateFromFile(c_str(data),
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
index f633f9d7b81e..3a026966dc78 100755
--- a/scripts/travis_script.sh
+++ b/scripts/travis_script.sh
@@ -64,7 +64,7 @@ if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then
         conda create -n myenv python=2.7
     fi
     source activate myenv
-    conda install numpy scipy matplotlib nose
+    conda install numpy scipy pandas matplotlib nose
     python -m pip install graphviz
 
     make all CXX=${CXX} || exit -1
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index afbc53e1edd4..9778d8338e74 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -97,6 +97,27 @@ def test_feature_names(self):
         dm = xgb.DMatrix(dummy, feature_names=list('abcde'))
         self.assertRaises(ValueError, bst.predict, dm)
 
+    def test_pandas(self):
+        import pandas as pd
+        df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c'])
+        dm = xgb.DMatrix(df, label=pd.Series([1, 2]))
+        assert dm.feature_names == ['a', 'b', 'c']
+        assert dm.feature_types == ['int', 'q', 'i']
+        assert dm.num_row() == 2
+        assert dm.num_col() == 3
+
+        # overwrite feature_names and feature_types
+        dm = xgb.DMatrix(df, label=pd.Series([1, 2]),
+                         feature_names=['x', 'y', 'z'], feature_types=['q', 'q', 'q'])
+        assert dm.feature_names == ['x', 'y', 'z']
+        assert dm.feature_types == ['q', 'q', 'q']
+        assert dm.num_row() == 2
+        assert dm.num_col() == 3
+
+        # incorrect dtypes
+        df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], columns=['a', 'b', 'c'])
+        self.assertRaises(ValueError, xgb.DMatrix, df)
+
     def test_load_file_invalid(self):
 
         self.assertRaises(ValueError, xgb.Booster,
From b958c55ac6ab6ceaffee3869fbb932b2eab1adbc Mon Sep 17 00:00:00 2001
From: sinhrks
Date: Fri, 2 Oct 2015 21:56:35 +0900
Subject: [PATCH 52/53] CV returns ndarray or DataFrame

---
 python-package/xgboost/training.py | 80 ++++++++++++++++++++++++------
 scripts/travis_script.sh           |  2 +-
 tests/python/test_basic.py         | 30 +++++++++++
 3 files changed, 96 insertions(+), 16 deletions(-)

diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
index 2f890a063c65..a6a7c203b3bd 100644
--- a/python-package/xgboost/training.py
+++ b/python-package/xgboost/training.py
@@ -1,5 +1,6 @@
 # coding: utf-8
 # pylint: disable=too-many-locals, too-many-arguments, invalid-name
+# pylint: disable=too-many-branches
 """Training Library containing training routines."""
 from __future__ import absolute_import
 
@@ -179,16 +180,16 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
     return ret
 
 
-def aggcv(rlist, show_stdv=True):
+def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True):
     # pylint: disable=invalid-name
     """
     Aggregate cross-validation results. 
""" cvmap = {} - ret = rlist[0].split()[0] + idx = rlist[0].split()[0] for line in rlist: arr = line.split() - assert ret == arr[0] + assert idx == arr[0] for it in arr[1:]: if not isinstance(it, STRING_TYPES): it = it.decode() @@ -196,19 +197,50 @@ def aggcv(rlist, show_stdv=True): if k not in cvmap: cvmap[k] = [] cvmap[k].append(float(v)) + + msg = idx + + if show_stdv: + fmt = '\tcv-{0}:{1}+{2}' + else: + fmt = '\tcv-{0}:{1}' + + index = [] + results = [] for k, v in sorted(cvmap.items(), key=lambda x: x[0]): v = np.array(v) - if not isinstance(ret, STRING_TYPES): - ret = ret.decode() - if show_stdv: - ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v)) - else: - ret += '\tcv-%s:%f' % (k, np.mean(v)) - return ret + if not isinstance(msg, STRING_TYPES): + msg = msg.decode() + mean, std = np.mean(v), np.std(v) + msg += fmt.format(k, mean, std) + + index.extend([k + '-mean', k + '-std']) + results.extend([mean, std]) + + + + if as_pandas: + try: + import pandas as pd + results = pd.Series(results, index=index) + except ImportError: + if show_progress is None: + show_progress = True + else: + # if show_progress is default (None), + # result will be np.ndarray as it can't hold column name + if show_progress is None: + show_progress = True + + if show_progress: + sys.stderr.write(msg + '\n') + + return results def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), - obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0): + obj=None, feval=None, fpreproc=None, as_pandas=True, + show_progress=None, show_stdv=True, seed=0): # pylint: disable = invalid-name """Cross-validation with given paramaters. @@ -231,8 +263,15 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), fpreproc : function Preprocessing function that takes (dtrain, dtest, param) and returns transformed versions of those. - show_stdv : bool - Whether to display the standard deviation. + as_pandas : bool, default True + Return pd.DataFrame when pandas is installed. + If False or pandas is not installed, return np.ndarray + show_progress : bool or None, default None + Whether to display the progress. If None, progress will be displayed + when np.ndarray is returned. + show_stdv : bool, default True + Whether to display the standard deviation in progress. + Results are not affected, and always contains std. seed : int Seed used to generate the folds (passed to numpy.random.seed). 
@@ -245,8 +284,19 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), for i in range(num_boost_round): for fold in cvfolds: fold.update(i, obj) - res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv) - sys.stderr.write(res + '\n') + res = aggcv([f.eval(i, feval) for f in cvfolds], + show_stdv=show_stdv, show_progress=show_progress, + as_pandas=as_pandas) results.append(res) + + if as_pandas: + try: + import pandas as pd + results = pd.DataFrame(results) + except ImportError: + results = np.array(results) + else: + results = np.array(results) + return results diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index f633f9d7b81e..3a026966dc78 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -64,7 +64,7 @@ if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then conda create -n myenv python=2.7 fi source activate myenv - conda install numpy scipy matplotlib nose + conda install numpy scipy pandas matplotlib nose python -m pip install graphviz make all CXX=${CXX} || exit -1 diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index afbc53e1edd4..d7a0e0e35ac8 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -127,6 +127,36 @@ def test_dmatrix_numpy_init(self): data = np.array([['a', 'b'], ['c', 'd']]) self.assertRaises(ValueError, xgb.DMatrix, data) + def test_cv(self): + dm = xgb.DMatrix(dpath + 'agaricus.txt.train') + params = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + + import pandas as pd + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + + # show progress log (result is the same as above) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + show_progress=True) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, + show_progress=True, show_stdv=False) + assert isinstance(cv, pd.DataFrame) + exp = pd.Index([u'test-error-mean', u'test-error-std', + u'train-error-mean', u'train-error-std']) + assert cv.columns.equals(exp) + + # return np.ndarray + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, as_pandas=False) + assert isinstance(cv, np.ndarray) + assert cv.shape == (10, 4) + def test_plotting(self): bst2 = xgb.Booster(model_file='xgb.model') # plotting From dbcb4c8729dc18bb3bb453389c248e7be88eaf50 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 4 Oct 2015 13:30:01 +0900 Subject: [PATCH 53/53] Support non-str column names --- python-package/xgboost/core.py | 2 +- tests/python/test_basic.py | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 8c256782014f..0273b7230da1 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -153,7 +153,7 @@ def _maybe_from_pandas(data, feature_names, feature_types): raise ValueError('DataFrame.dtypes must be int, float or bool') if feature_names is None: - feature_names = data.columns.tolist() + feature_names = data.columns.format() if feature_types is None: mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'} feature_types = [mapper[dtype.name] for dtype in dtypes] diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py 
index 7cb68ad77836..fa287b247d86 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -118,6 +118,21 @@ def test_pandas(self): df = pd.DataFrame([[1, 2., 'x'], [2, 3., 'y']], columns=['a', 'b', 'c']) self.assertRaises(ValueError, xgb.DMatrix, df) + # numeric columns + df = pd.DataFrame([[1, 2., True], [2, 3., False]]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2])) + assert dm.feature_names == ['0', '1', '2'] + assert dm.feature_types == ['int', 'q', 'i'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + + df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) + dm = xgb.DMatrix(df, label=pd.Series([1, 2])) + assert dm.feature_names == ['4', '5', '6'] + assert dm.feature_types == ['int', 'q', 'int'] + assert dm.num_row() == 2 + assert dm.num_col() == 3 + def test_load_file_invalid(self): self.assertRaises(ValueError, xgb.Booster,
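
For reference, a tiny sketch of the column handling this last patch switches to, assuming pandas is installed; Index.format() returns the labels as strings, which is what lets non-string (e.g. integer) column names pass the feature-name validation. The frame below is made up:

    import pandas as pd

    df = pd.DataFrame([[1, 2.0, True], [2, 3.0, False]], columns=[4, 5, 6])
    print(df.columns.tolist())   # [4, 5, 6] -- integer labels would fail the string check
    print(df.columns.format())   # ['4', '5', '6'] -- stringified labels are valid feature names
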