Commit 3453b6e

Merge pull request #5 from dmlc/master

update from dmlc/xgboost

yanqingmen committed Oct 7, 2015
2 parents 34f0b31 + 3109069
Showing 40 changed files with 1,065 additions and 242 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
@@ -30,6 +30,8 @@ addons:
     - wget
     - libcurl4-openssl-dev
     - unzip
+    - python-numpy
+    - python-scipy
 
 before_install:
   - scripts/travis_osx_install.sh
2 changes: 2 additions & 0 deletions CONTRIBUTORS.md
@@ -34,6 +34,7 @@ List of Contributors
 * [Zygmunt Zając](https://github.com/zygmuntz)
   - Zygmunt is the master behind the early stopping feature frequently used by Kagglers.
 * [Ajinkya Kale](https://github.com/ajkl)
+* [Yuan Tang](https://github.com/terrytangyuan)
 * [Boliang Chen](https://github.com/cblsjtu)
 * [Vadim Khotilovich](https://github.com/khotilov)
 * [Yangqing Men](https://github.com/yanqingmen)
@@ -48,3 +49,4 @@ List of Contributors
   - Masaaki is the initial creator of the xgboost python plotting module.
 * [Hongliang Liu](https://github.com/phunterlau)
   - Hongliang is the maintainer of the xgboost python PyPI package for pip installation.
+* [Huayi Zhang](https://github.com/irachex)
16 changes: 12 additions & 4 deletions Makefile
@@ -1,6 +1,6 @@
export CC = gcc
#build on the fly
export CXX = g++
export CC = $(if $(shell which gcc-5),gcc-5,gcc)
export CXX = $(if $(shell which g++-5),g++-5,g++)

export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
@@ -21,9 +21,17 @@ endif
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP
 else
-	CFLAGS += -fopenmp
+	#CFLAGS += -fopenmp
+	ifeq ($(omp_mac_static),1)
+	#CFLAGS += -fopenmp -Bstatic
+	CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
+	#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
+	else
+	CFLAGS += -fopenmp
+	endif
 endif
 
 
 # by default use c++11
 ifeq ($(cxx11),1)
 	CFLAGS += -std=c++11
3 changes: 2 additions & 1 deletion R-package/DESCRIPTION
@@ -23,7 +23,8 @@ Suggests:
     ggplot2 (>= 1.0.0),
     DiagrammeR (>= 0.6),
     Ckmeans.1d.dp (>= 3.3.1),
-    vcd (>= 1.3)
+    vcd (>= 1.3),
+    testthat
 Depends:
     R (>= 2.10)
 Imports:
9 changes: 8 additions & 1 deletion R-package/R/utils.R
@@ -103,17 +103,21 @@ xgb.Booster.check <- function(bst, saveraw = TRUE)
 ## ----the following are low-level functions, not needed if
 ## you do not want to use them ---------------------------------------
 # get dmatrix from data, label
-xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
+xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) {
   inClass <- class(data)
   if (inClass == "dgCMatrix" || inClass == "matrix") {
     if (is.null(label)) {
       stop("xgboost: need label when data is a matrix")
     }
-    dtrain <- xgb.DMatrix(data, label = label)
+    if (is.null(missing)) {
+      dtrain <- xgb.DMatrix(data, label = label)
+    } else {
+      dtrain <- xgb.DMatrix(data, label = label, missing = missing)
+    }
+    if (!is.null(weight)) {
+      xgb.setinfo(dtrain, "weight", weight)
+    }
   } else {
     if (!is.null(label)) {
       warning("xgboost: label will be ignored.")
@@ -122,6 +126,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
     dtrain <- xgb.DMatrix(data)
   } else if (inClass == "xgb.DMatrix") {
     dtrain <- data
+  } else if (inClass == "data.frame") {
+    stop("xgboost only supports numerical matrix input; ",
+         "use 'data.matrix' to transform the data.")
   } else {
     stop("xgboost: Invalid input of data")
   }
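`xgb.get.DMatrix` is internal, so the new `weight` argument is reached through the `xgboost()` wrapper (see the xgboost.R diff below). A minimal sketch, not part of this commit; the uniform weights are only placeholders, and the agaricus data ships with the package:

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
train <- agaricus.train

# uniform row weights reproduce unweighted training; real use cases
# would up- or down-weight individual rows
w <- rep(1, nrow(train$data))
bst <- xgboost(data = train$data, label = train$label, weight = w,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
```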
10 changes: 9 additions & 1 deletion R-package/R/xgb.train.R
@@ -72,6 +72,8 @@
 #'   keeps getting worse consecutively for \code{k} rounds.
 #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #'   \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
+#' @param save_name the name or path for the periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -120,7 +122,8 @@
 #'
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
                       obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
-                      early.stop.round = NULL, maximize = NULL, ...) {
+                      early.stop.round = NULL, maximize = NULL,
+                      save_period = 0, save_name = "xgboost.model", ...) {
   dtrain <- data
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list")
@@ -215,6 +218,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
       }
     }
   }
+  if (save_period > 0) {
+    if (i %% save_period == 0) {
+      xgb.save(bst, save_name)
+    }
+  }
 }
 bst <- xgb.Booster.check(bst)
 if (!is.null(early.stop.round)) {
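A minimal sketch of the new periodic checkpointing, not part of the commit: because `save_name` is fixed, each call to `xgb.save` overwrites the previous snapshot, so the file ends up holding the most recent multiple of `save_period`.

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param <- list(max.depth = 2, eta = 1, objective = "binary:logistic")
# write a snapshot after rounds 5, 10, 15, 20 (each overwriting the last)
bst <- xgb.train(param, dtrain, nrounds = 20,
                 save_period = 5, save_name = "xgboost.model")
bst2 <- xgb.load("xgboost.model")  # reload the round-20 snapshot
```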
17 changes: 9 additions & 8 deletions R-package/R/xgboost.R
@@ -31,11 +31,14 @@
 #' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param missing Missing is only used when input is a dense matrix; pick a float
 #'   value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
+#' @param weight a vector indicating the weight for each row of the input.
 #' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
 #'   If set to an integer \code{k}, training with a validation set will stop if the performance
 #'   keeps getting worse consecutively for \code{k} rounds.
 #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #'   \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
+#' @param save_name the name or path for the periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -56,14 +59,11 @@
 #'
 #' @export
 #'
-xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
+xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL,
+                    params = list(), nrounds,
                     verbose = 1, print.every.n = 1L, early.stop.round = NULL,
-                    maximize = NULL, ...) {
-  if (is.null(missing)) {
-    dtrain <- xgb.get.DMatrix(data, label)
-  } else {
-    dtrain <- xgb.get.DMatrix(data, label, missing)
-  }
+                    maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
+  dtrain <- xgb.get.DMatrix(data, label, missing, weight)
 
   params <- append(params, list(...))
 
@@ -74,7 +74,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
   }
 
   bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
-                   early.stop.round = early.stop.round)
+                   early.stop.round = early.stop.round, maximize = maximize,
+                   save_period = save_period, save_name = save_name)
 
   return(bst)
 }
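For illustration only (this example is not in the commit; the `-999` sentinel and the `toy.model` filename are made up), a dense matrix that encodes missing entries with a sentinel can now be passed straight through the wrapper, with `maximize`, `save_period`, and `save_name` forwarded on to `xgb.train`:

```r
require(xgboost)
set.seed(42)

# toy dense matrix where -999 stands in for missing entries
x <- matrix(rnorm(100 * 4), ncol = 4)
y <- as.numeric(x[, 1] > 0)        # label depends on the first feature
x[sample(length(x), 20)] <- -999   # then punch holes marked by the sentinel

bst <- xgboost(data = x, label = y, missing = -999,
               max.depth = 2, eta = 1, nround = 5,
               objective = "binary:logistic",
               save_period = 5, save_name = "toy.model")
```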
1 change: 1 addition & 0 deletions R-package/demo/00Index
@@ -1,4 +1,5 @@
 basic_walkthrough	Basic feature walkthrough
+caret_wrapper	Use xgboost to train in the caret library
 custom_objective	Customize loss function and evaluation metric
 boost_from_prediction	Boosting from existing prediction
 predict_first_ntree	Predicting using first n trees
3 changes: 2 additions & 1 deletion R-package/demo/README.md
@@ -1,6 +1,7 @@
 XGBoost R Feature Walkthrough
 ====
-* [Basic walkthrough of wrappers](basic_walkthrough.R)
+* [Basic walkthrough of wrappers](basic_walkthrough.R)
+* [Train an xgboost model from the caret library](caret_wrapper.R)
 * [Customize loss function and evaluation metric](custom_objective.R)
 * [Boosting from existing prediction](boost_from_prediction.R)
 * [Predicting using first n trees](predict_first_ntree.R)
35 changes: 35 additions & 0 deletions R-package/demo/caret_wrapper.R
@@ -0,0 +1,35 @@
# install the development version of the caret library, which contains the xgboost models
devtools::install_github("topepo/caret/pkg/caret")
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)

# Load the Arthritis dataset into memory.
data(Arthritis)
# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data.frame, but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)

# Let's add some new categorical features to see if they help. Of course these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]

# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value based on nothing. We will see later whether simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it would just add some noise, as the dataset is small).
df[,ID:=NULL]

#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train.
# Here we use 10-fold cross-validation, repeated twice, with random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# Train an xgbTree model using caret::train.
model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)

# Instead of trees as our boosters, you can also fit a linear or logistic regression model using xgbLinear.
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)

# See model results
print(model)
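As a possible follow-up (not part of the demo file), the fitted caret object can be used like any other `train` result; the sketch below only assumes the `model` and `df` objects created above:

```r
# predict back on the training table as a quick sanity check of the fit
preds <- predict(model, newdata = df)
confusionMatrix(preds, factor(df$Improved))
```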
1 change: 1 addition & 0 deletions R-package/demo/runall.R
@@ -9,3 +9,4 @@ demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
+demo(caret_wrapper)
7 changes: 6 additions & 1 deletion R-package/man/xgb.train.Rd
@@ -6,7 +6,8 @@
 \usage{
 xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
   feval = NULL, verbose = 1, print.every.n = 1L,
-  early.stop.round = NULL, maximize = NULL, ...)
+  early.stop.round = NULL, maximize = NULL, save_period = 0,
+  save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{params}{the list of parameters.
@@ -87,6 +88,10 @@ keeps getting worse consecutively for \code{k} rounds.}
 \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
+
+\item{save_name}{the name or path for the periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
13 changes: 10 additions & 3 deletions R-package/man/xgboost.Rd
@@ -4,9 +4,10 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting (Tree) library}
 \usage{
-xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-  nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
-  maximize = NULL, ...)
+xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL,
+  params = list(), nrounds, verbose = 1, print.every.n = 1L,
+  early.stop.round = NULL, maximize = NULL, save_period = 0,
+  save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@@ -18,6 +19,8 @@ if data is local data file or \code{xgb.DMatrix}.}
 \item{missing}{Missing is only used when input is a dense matrix; pick a float
 value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
 
+\item{weight}{a vector indicating the weight for each row of the input.}
+
 \item{params}{the list of parameters.
 
 Commonly used ones are:
@@ -51,6 +54,10 @@ keeps getting worse consecutively for \code{k} rounds.}
 \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
+
+\item{save_name}{the name or path for the periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
4 changes: 4 additions & 0 deletions R-package/tests/testthat.R
@@ -0,0 +1,4 @@
library(testthat)
library(xgboost)

test_check("xgboost")
33 changes: 33 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -0,0 +1,33 @@
require(xgboost)

context("basic functions")

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train = agaricus.train
test = agaricus.test

test_that("train and predict", {
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
  pred = predict(bst, test$data)
})


test_that("early stopping", {
  res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
               eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
               early.stop.round = 3, maximize = FALSE)
  expect_true(nrow(res) < 20)
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
                early.stop.round = 3, maximize = FALSE)
  pred = predict(bst, test$data)
})

test_that("save_period", {
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
                save_period = 10, save_name = "xgb.model")
  pred = predict(bst, test$data)
})
47 changes: 47 additions & 0 deletions R-package/tests/testthat/test_custom_objective.R
@@ -0,0 +1,47 @@
context('Test models with custom objective')

require(xgboost)

test_that("custom objective works", {
  data(agaricus.train, package = 'xgboost')
  data(agaricus.test, package = 'xgboost')
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

  watchlist <- list(eval = dtest, train = dtrain)
  num_round <- 2

  # gradient and hessian of the logistic loss, evaluated at the raw margin
  logregobj <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
  }
  # classification error, thresholding the raw margin at 0
  evalerror <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
    return(list(metric = "error", value = err))
  }

  param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
                objective = logregobj, eval_metric = evalerror)

  bst <- xgb.train(param, dtrain, num_round, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(length(bst$raw), 1064)
  attr(dtrain, 'label') <- getinfo(dtrain, 'label')

  # same objective, but reading labels from an attribute instead of the DMatrix
  logregobjattr <- function(preds, dtrain) {
    labels <- attr(dtrain, 'label')
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
  }
  param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
                objective = logregobjattr, eval_metric = evalerror)
  bst <- xgb.train(param, dtrain, num_round, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(length(bst$raw), 1064)
})
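For readers wondering where `grad` and `hess` come from: for the logistic loss with raw margin f and p = 1/(1 + exp(-f)), the first and second derivatives with respect to f are p - y and p(1 - p), which is exactly what `logregobj` returns. A quick numerical check, included here as an illustration rather than part of the test file:

```r
y <- 1; f <- 0.3; eps <- 1e-6
loss <- function(f) {
  p <- 1 / (1 + exp(-f))
  -(y * log(p) + (1 - y) * log(1 - p))
}
p <- 1 / (1 + exp(-f))
(loss(f + eps) - loss(f - eps)) / (2 * eps)  # numeric gradient
p - y                                        # analytic gradient used by logregobj
```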
19 changes: 19 additions & 0 deletions R-package/tests/testthat/test_glm.R
@@ -0,0 +1,19 @@
context('Test generalized linear models')

require(xgboost)

test_that("glm works", {
  data(agaricus.train, package = 'xgboost')
  data(agaricus.test, package = 'xgboost')
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
  expect_equal(class(dtrain), "xgb.DMatrix")
  expect_equal(class(dtest), "xgb.DMatrix")
  param <- list(objective = "binary:logistic", booster = "gblinear",
                nthread = 2, alpha = 0.0001, lambda = 1)
  watchlist <- list(eval = dtest, train = dtrain)
  num_round <- 2
  bst <- xgb.train(param, dtrain, num_round, watchlist)
  ypred <- predict(bst, dtest)
  expect_equal(length(getinfo(dtest, 'label')), 1611)
})
