Commit 3453b6e

Merge pull request #5 from dmlc/master

update from dmlc/xgboost

yanqingmen committed Oct 7, 2015
2 parents 34f0b31 + 3109069
Showing 40 changed files with 1,065 additions and 242 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
@@ -30,6 +30,8 @@ addons:
     - wget
     - libcurl4-openssl-dev
     - unzip
+    - python-numpy
+    - python-scipy
 
 before_install:
   - scripts/travis_osx_install.sh
2 changes: 2 additions & 0 deletions CONTRIBUTORS.md
@@ -34,6 +34,7 @@ List of Contributors
 * [Zygmunt Zając](https://github.com/zygmuntz)
   - Zygmunt is the master behind the early stopping feature frequently used by Kagglers.
 * [Ajinkya Kale](https://github.com/ajkl)
+* [Yuan Tang](https://github.com/terrytangyuan)
 * [Boliang Chen](https://github.com/cblsjtu)
 * [Vadim Khotilovich](https://github.com/khotilov)
 * [Yangqing Men](https://github.com/yanqingmen)
@@ -48,3 +49,4 @@ List of Contributors
   - Masaaki is the initial creator of the xgboost python plotting module.
 * [Hongliang Liu](https://github.com/phunterlau)
   - Hongliang is the maintainer of the xgboost python PyPI package for pip installation.
+* [Huayi Zhang](https://github.com/irachex)
16 changes: 12 additions & 4 deletions Makefile
@@ -1,6 +1,6 @@
export CC = gcc
#build on the fly
export CXX = g++
export CC = $(if $(shell which gcc-5),gcc-5,gcc)
export CXX = $(if $(shell which g++-5),g++-5,g++)

export MPICXX = mpicxx
export LDFLAGS= -pthread -lm
export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -funroll-loops
@@ -21,9 +21,17 @@ endif
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP
 else
-	CFLAGS += -fopenmp
+	#CFLAGS += -fopenmp
+	ifeq ($(omp_mac_static),1)
+	#CFLAGS += -fopenmp -Bstatic
+	CFLAGS += -static-libgcc -static-libstdc++ -L. -fopenmp
+	#LDFLAGS += -Wl,--whole-archive -lpthread -Wl --no-whole-archive
+	else
+	CFLAGS += -fopenmp
+	endif
 endif
 
 
 # by default use c++11
 ifeq ($(cxx11),1)
 	CFLAGS += -std=c++11
3 changes: 2 additions & 1 deletion R-package/DESCRIPTION
@@ -23,7 +23,8 @@ Suggests:
     ggplot2 (>= 1.0.0),
     DiagrammeR (>= 0.6),
     Ckmeans.1d.dp (>= 3.3.1),
-    vcd (>= 1.3)
+    vcd (>= 1.3),
+    testthat
 Depends:
     R (>= 2.10)
 Imports:
9 changes: 8 additions & 1 deletion R-package/R/utils.R
@@ -103,17 +103,21 @@ xgb.Booster.check <- function(bst, saveraw = TRUE)
 ## ----the following are low-level functions, not needed if
 ## you do not want to use them ---------------------------------------
 # get dmatrix from data, label
-xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
+xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) {
   inClass <- class(data)
   if (inClass == "dgCMatrix" || inClass == "matrix") {
     if (is.null(label)) {
       stop("xgboost: need label when data is a matrix")
     }
-    dtrain <- xgb.DMatrix(data, label = label)
+    if (is.null(missing)) {
+      dtrain <- xgb.DMatrix(data, label = label)
+    } else {
+      dtrain <- xgb.DMatrix(data, label = label, missing = missing)
+    }
+    if (!is.null(weight)) {
+      xgb.setinfo(dtrain, "weight", weight)
+    }
   } else {
     if (!is.null(label)) {
       warning("xgboost: label will be ignored.")
@@ -122,6 +126,9 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL) {
     dtrain <- xgb.DMatrix(data)
   } else if (inClass == "xgb.DMatrix") {
     dtrain <- data
+  } else if (inClass == "data.frame") {
+    stop("xgboost only supports numerical matrix input; ",
+         "use 'data.matrix' to transform the data.")
   } else {
     stop("xgboost: Invalid input of data")
   }
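`xgb.get.DMatrix` is internal, so the new `weight` argument is reached through the `xgboost()` wrapper (see the xgboost.R diff below). A minimal sketch, not part of this commit; the uniform weights are only placeholders, and the agaricus data ships with the package:

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
train <- agaricus.train

# uniform row weights reproduce unweighted training; real use cases
# would up- or down-weight individual rows
w <- rep(1, nrow(train$data))
bst <- xgboost(data = train$data, label = train$label, weight = w,
               max.depth = 2, eta = 1, nround = 2,
               objective = "binary:logistic")
```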
10 changes: 9 additions & 1 deletion R-package/R/xgb.train.R
@@ -72,6 +72,8 @@
 #'   keeps getting worse consecutively for \code{k} rounds.
 #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #'   \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
+#' @param save_name the name or path for the periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -120,7 +122,8 @@
 #'
 xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
                       obj = NULL, feval = NULL, verbose = 1, print.every.n=1L,
-                      early.stop.round = NULL, maximize = NULL, ...) {
+                      early.stop.round = NULL, maximize = NULL,
+                      save_period = 0, save_name = "xgboost.model", ...) {
   dtrain <- data
   if (typeof(params) != "list") {
     stop("xgb.train: first argument params must be list")
@@ -215,6 +218,11 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(),
       }
     }
   }
+  if (save_period > 0) {
+    if (i %% save_period == 0) {
+      xgb.save(bst, save_name)
+    }
+  }
 }
 bst <- xgb.Booster.check(bst)
 if (!is.null(early.stop.round)) {
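A minimal sketch of the new periodic checkpointing, not part of the commit: because `save_name` is fixed, each call to `xgb.save` overwrites the previous snapshot, so the file ends up holding the most recent multiple of `save_period`.

```r
require(xgboost)
data(agaricus.train, package = 'xgboost')
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)

param <- list(max.depth = 2, eta = 1, objective = "binary:logistic")
# write a snapshot after rounds 5, 10, 15, 20 (each overwriting the last)
bst <- xgb.train(param, dtrain, nrounds = 20,
                 save_period = 5, save_name = "xgboost.model")
bst2 <- xgb.load("xgboost.model")  # reload the round-20 snapshot
```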
17 changes: 9 additions & 8 deletions R-package/R/xgboost.R
@@ -31,11 +31,14 @@
 #' @param print.every.n Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.
 #' @param missing Missing is only used when input is a dense matrix; pick a float
 #'   value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.
+#' @param weight a vector indicating the weight for each row of the input.
 #' @param early.stop.round If \code{NULL}, the early stopping function is not triggered.
 #'   If set to an integer \code{k}, training with a validation set will stop if the performance
 #'   keeps getting worse consecutively for \code{k} rounds.
 #' @param maximize If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 #'   \code{maximize=TRUE} means the larger the evaluation score the better.
+#' @param save_period save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.
+#' @param save_name the name or path for the periodically saved model file.
 #' @param ... other parameters to pass to \code{params}.
 #'
 #' @details
@@ -56,14 +59,11 @@
 #'
 #' @export
 #'
-xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(), nrounds,
+xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL,
+                    params = list(), nrounds,
                     verbose = 1, print.every.n = 1L, early.stop.round = NULL,
-                    maximize = NULL, ...) {
-  if (is.null(missing)) {
-    dtrain <- xgb.get.DMatrix(data, label)
-  } else {
-    dtrain <- xgb.get.DMatrix(data, label, missing)
-  }
+                    maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) {
+  dtrain <- xgb.get.DMatrix(data, label, missing, weight)
 
   params <- append(params, list(...))
 
@@ -74,7 +74,8 @@ xgboost <- function(data = NULL, label = NULL, missing = NULL, params = list(),
   }
 
   bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n,
-                   early.stop.round = early.stop.round)
+                   early.stop.round = early.stop.round, maximize = maximize,
+                   save_period = save_period, save_name = save_name)
 
   return(bst)
 }
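For illustration only (this example is not in the commit; the `-999` sentinel and the `toy.model` filename are made up), a dense matrix that encodes missing entries with a sentinel can now be passed straight through the wrapper, with `maximize`, `save_period`, and `save_name` forwarded on to `xgb.train`:

```r
require(xgboost)
set.seed(42)

# toy dense matrix where -999 stands in for missing entries
x <- matrix(rnorm(100 * 4), ncol = 4)
y <- as.numeric(x[, 1] > 0)        # label depends on the first feature
x[sample(length(x), 20)] <- -999   # then punch holes marked by the sentinel

bst <- xgboost(data = x, label = y, missing = -999,
               max.depth = 2, eta = 1, nround = 5,
               objective = "binary:logistic",
               save_period = 5, save_name = "toy.model")
```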
1 change: 1 addition & 0 deletions R-package/demo/00Index
@@ -1,4 +1,5 @@
 basic_walkthrough	Basic feature walkthrough
+caret_wrapper	Use xgboost to train in the caret library
 custom_objective	Customize loss function and evaluation metric
 boost_from_prediction	Boosting from existing prediction
 predict_first_ntree	Predicting using first n trees
3 changes: 2 additions & 1 deletion R-package/demo/README.md
@@ -1,6 +1,7 @@
 XGBoost R Feature Walkthrough
 ====
-* [Basic walkthrough of wrappers](basic_walkthrough.R)
+* [Basic walkthrough of wrappers](basic_walkthrough.R)
+* [Train an xgboost model from the caret library](caret_wrapper.R)
 * [Customize loss function and evaluation metric](custom_objective.R)
 * [Boosting from existing prediction](boost_from_prediction.R)
 * [Predicting using first n trees](predict_first_ntree.R)
35 changes: 35 additions & 0 deletions R-package/demo/caret_wrapper.R
@@ -0,0 +1,35 @@
# install the development version of the caret library, which contains the xgboost models
devtools::install_github("topepo/caret/pkg/caret")
require(caret)
require(xgboost)
require(data.table)
require(vcd)
require(e1071)

# Load the Arthritis dataset into memory.
data(Arthritis)
# Create a copy of the dataset with the data.table package (data.table is 100% compliant with R data.frame, but its syntax is a lot more consistent and its performance is really good).
df <- data.table(Arthritis, keep.rownames = F)

# Let's add some new categorical features to see if they help. Of course these features are highly correlated with the Age feature. Usually that's not a good thing in ML, but tree algorithms (including boosted trees) are able to select the best features, even in the case of highly correlated features.
# For the first feature we create groups of age by rounding the real age. Note that we transform it to a factor (categorical data) so the algorithm treats the groups as independent values.
df[,AgeDiscret:= as.factor(round(Age/10,0))]

# Here is an even stronger simplification of the real age, with an arbitrary split at 30 years old. I chose this value based on nothing. We will see later whether simplifying the information based on arbitrary values is a good strategy (I am sure you already have an idea of how well it will work!).
df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))]

# We remove ID as there is nothing to learn from this feature (it would just add some noise, as the dataset is small).
df[,ID:=NULL]

#-------------Basic Training using XGBoost in caret Library-----------------
# Set up control parameters for caret::train.
# Here we use 10-fold cross-validation, repeated twice, with random search for tuning hyper-parameters.
fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 2, search = "random")
# Train an xgbTree model using caret::train.
model <- train(factor(Improved)~., data = df, method = "xgbTree", trControl = fitControl)

# Instead of trees as our boosters, you can also fit a linear or logistic regression model using xgbLinear.
# model <- train(factor(Improved)~., data = df, method = "xgbLinear", trControl = fitControl)

# See model results
print(model)
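As a possible follow-up (not part of the demo file), the fitted caret object can be used like any other `train` result; the sketch below only assumes the `model` and `df` objects created above:

```r
# predict back on the training table as a quick sanity check of the fit
preds <- predict(model, newdata = df)
confusionMatrix(preds, factor(df$Improved))
```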
1 change: 1 addition & 0 deletions R-package/demo/runall.R
@@ -9,3 +9,4 @@ demo(create_sparse_matrix)
 demo(predict_leaf_indices)
 demo(early_stopping)
 demo(poisson_regression)
+demo(caret_wrapper)
7 changes: 6 additions & 1 deletion R-package/man/xgb.train.Rd
@@ -6,7 +6,8 @@
 \usage{
 xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL,
   feval = NULL, verbose = 1, print.every.n = 1L,
-  early.stop.round = NULL, maximize = NULL, ...)
+  early.stop.round = NULL, maximize = NULL, save_period = 0,
+  save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{params}{the list of parameters.
@@ -87,6 +88,10 @@ keeps getting worse consecutively for \code{k} rounds.}
 \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
+
+\item{save_name}{the name or path for the periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
13 changes: 10 additions & 3 deletions R-package/man/xgboost.Rd
@@ -4,9 +4,10 @@
 \alias{xgboost}
 \title{eXtreme Gradient Boosting (Tree) library}
 \usage{
-xgboost(data = NULL, label = NULL, missing = NULL, params = list(),
-  nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL,
-  maximize = NULL, ...)
+xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL,
+  params = list(), nrounds, verbose = 1, print.every.n = 1L,
+  early.stop.round = NULL, maximize = NULL, save_period = 0,
+  save_name = "xgboost.model", ...)
 }
 \arguments{
 \item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or
@@ -18,6 +19,8 @@ if data is local data file or \code{xgb.DMatrix}.}
 \item{missing}{Missing is only used when input is a dense matrix; pick a float
 value that represents the missing value. Sometimes a dataset uses 0 or another extreme value to represent missing values.}
 
+\item{weight}{a vector indicating the weight for each row of the input.}
+
 \item{params}{the list of parameters.
 
 Commonly used ones are:
@@ -51,6 +54,10 @@ keeps getting worse consecutively for \code{k} rounds.}
 \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well.
 \code{maximize=TRUE} means the larger the evaluation score the better.}
 
+\item{save_period}{save the model to disk after every \code{save_period} rounds; 0 means no periodic saving.}
+
+\item{save_name}{the name or path for the periodically saved model file.}
+
 \item{...}{other parameters to pass to \code{params}.}
 }
 \description{
4 changes: 4 additions & 0 deletions R-package/tests/testthat.R
@@ -0,0 +1,4 @@
library(testthat)
library(xgboost)

test_check("xgboost")
33 changes: 33 additions & 0 deletions R-package/tests/testthat/test_basic.R
@@ -0,0 +1,33 @@
require(xgboost)

context("basic functions")

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')
train = agaricus.train
test = agaricus.test

test_that("train and predict", {
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 1, nthread = 2, nround = 2, objective = "binary:logistic")
  pred = predict(bst, test$data)
})


test_that("early stopping", {
  res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5,
               eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
               early.stop.round = 3, maximize = FALSE)
  expect_true(nrow(res) < 20)
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
                early.stop.round = 3, maximize = FALSE)
  pred = predict(bst, test$data)
})

test_that("save_period", {
  bst = xgboost(data = train$data, label = train$label, max.depth = 2,
                eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic",
                save_period = 10, save_name = "xgb.model")
  pred = predict(bst, test$data)
})
47 changes: 47 additions & 0 deletions R-package/tests/testthat/test_custom_objective.R
@@ -0,0 +1,47 @@
context('Test models with custom objective')

require(xgboost)

test_that("custom objective works", {
  data(agaricus.train, package = 'xgboost')
  data(agaricus.test, package = 'xgboost')
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

  watchlist <- list(eval = dtest, train = dtrain)
  num_round <- 2

  # gradient and hessian of the logistic loss, evaluated at the raw margin
  logregobj <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
  }
  # classification error, thresholding the raw margin at 0
  evalerror <- function(preds, dtrain) {
    labels <- getinfo(dtrain, "label")
    err <- as.numeric(sum(labels != (preds > 0))) / length(labels)
    return(list(metric = "error", value = err))
  }

  param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
                objective = logregobj, eval_metric = evalerror)

  bst <- xgb.train(param, dtrain, num_round, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(length(bst$raw), 1064)
  attr(dtrain, 'label') <- getinfo(dtrain, 'label')

  # same objective, but reading labels from an attribute instead of the DMatrix
  logregobjattr <- function(preds, dtrain) {
    labels <- attr(dtrain, 'label')
    preds <- 1 / (1 + exp(-preds))
    grad <- preds - labels
    hess <- preds * (1 - preds)
    return(list(grad = grad, hess = hess))
  }
  param <- list(max.depth = 2, eta = 1, nthread = 2, silent = 1,
                objective = logregobjattr, eval_metric = evalerror)
  bst <- xgb.train(param, dtrain, num_round, watchlist)
  expect_equal(class(bst), "xgb.Booster")
  expect_equal(length(bst$raw), 1064)
})
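For readers wondering where `grad` and `hess` come from: for the logistic loss with raw margin f and p = 1/(1 + exp(-f)), the first and second derivatives with respect to f are p - y and p(1 - p), which is exactly what `logregobj` returns. A quick numerical check, included here as an illustration rather than part of the test file:

```r
y <- 1; f <- 0.3; eps <- 1e-6
loss <- function(f) {
  p <- 1 / (1 + exp(-f))
  -(y * log(p) + (1 - y) * log(1 - p))
}
p <- 1 / (1 + exp(-f))
(loss(f + eps) - loss(f - eps)) / (2 * eps)  # numeric gradient
p - y                                        # analytic gradient used by logregobj
```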
19 changes: 19 additions & 0 deletions R-package/tests/testthat/test_glm.R
@@ -0,0 +1,19 @@
context('Test generalized linear models')

require(xgboost)

test_that("glm works", {
  data(agaricus.train, package = 'xgboost')
  data(agaricus.test, package = 'xgboost')
  dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
  dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)
  expect_equal(class(dtrain), "xgb.DMatrix")
  expect_equal(class(dtest), "xgb.DMatrix")
  param <- list(objective = "binary:logistic", booster = "gblinear",
                nthread = 2, alpha = 0.0001, lambda = 1)
  watchlist <- list(eval = dtest, train = dtrain)
  num_round <- 2
  bst <- xgb.train(param, dtrain, num_round, watchlist)
  ypred <- predict(bst, dtest)
  expect_equal(length(getinfo(dtest, 'label')), 1611)
})
