diff --git a/.gitignore b/.gitignore index 048803abd7de..276ed2d54be1 100644 --- a/.gitignore +++ b/.gitignore @@ -66,3 +66,8 @@ java/xgboost4j-demo/tmp/ java/xgboost4j-demo/model/ nb-configuration* dmlc-core +# Eclipse +.project +.cproject +.pydevproject +.settings/ diff --git a/CHANGES.md b/CHANGES.md index a8ddcd7ea577..25d7efe26942 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -37,11 +37,22 @@ xgboost-0.4 on going at master ================== -* Fix List - - Fixed possible problem of poisson regression for R. -* Python module now throw exception instead of crash terminal when a parameter error happens. -* Python module now has importance plot and tree plot functions. +* Changes in R library + - fixed possible problem of poisson regression. + - switched from 0 to NA for missing values. + - exposed access to additional model parameters. +* Changes in Python library + - throws exception instead of crash terminal when a parameter error happens. + - has importance plot and tree plot functions. + - accepts different learning rates for each boosting round. + - allows model training continuation from previously saved model. + - allows early stopping in CV. + - allows feval to return a list of tuples. + - allows eval_metric to handle additional format. + - improved compatibility in sklearn module. + - additional parameters added for sklearn wrapper. + - added pip installation functionality. + - supports more Pandas DataFrame dtypes. + - added best_ntree_limit attribute, in addition to best_score and best_iteration. * Java api is ready for use -* Added more test cases and continuous integration to make each build more robust -* Improvements in sklearn compatible module -* Added pip installation functionality for python module +* Added more test cases and continuous integration to make each build more robust. diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 32a6745f01f4..568ec2635e7c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -13,6 +13,8 @@ Committers are people who have made substantial contribution to the project and - Bing is the original creator of xgboost python package and currently the maintainer of [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl). * [Michael Benesty](https://github.com/pommedeterresautee) - Micheal is a lawyer, data scientist in France, he is the creator of xgboost interactive analysis module in R. +* [Yuan Tang](https://github.com/terrytangyuan) + - Yuan is a data scientist in Chicago, US. He has contributed mostly to the R and Python packages. Become a Comitter ----------------- @@ -34,7 +36,6 @@ List of Contributors * [Zygmunt Zając](https://github.com/zygmuntz) - Zygmunt is the master behind the early stopping feature frequently used by kagglers. * [Ajinkya Kale](https://github.com/ajkl) -* [Yuan Tang](https://github.com/terrytangyuan) * [Boliang Chen](https://github.com/cblsjtu) * [Vadim Khotilovich](https://github.com/khotilov) * [Yangqing Men](https://github.com/yanqingmen) @@ -49,4 +50,10 @@ List of Contributors - Masaaki is the initial creator of xgboost python plotting module. * [Hongliang Liu](https://github.com/phunterlau) - Hongliang is the maintainer of xgboost python PyPI package for pip installation. +* [daiyl0320](https://github.com/daiyl0320) + - daiyl0320 contributed a patch that makes the xgboost distributed version more robust and lets it scale stably on TB-scale datasets.
* [Huayi Zhang](https://github.com/irachex) +* [Johan Manders](https://github.com/johanmanders) +* [yoori](https://github.com/yoori) +* [Mathias Müller](https://github.com/far0n) +* [Sam Thomson](https://github.com/sammthomson) diff --git a/Makefile b/Makefile index 6685b0c6daa7..986c5d7747a6 100644 --- a/Makefile +++ b/Makefile @@ -177,11 +177,11 @@ Rcheck: R CMD check --as-cran xgboost*.tar.gz pythonpack: - #make clean + #for pip maintainer only cd subtree/rabit;make clean;cd .. rm -rf xgboost-deploy xgboost*.tar.gz cp -r python-package xgboost-deploy - cp *.md xgboost-deploy/ + #cp *.md xgboost-deploy/ cp LICENSE xgboost-deploy/ cp Makefile xgboost-deploy/xgboost cp -r wrapper xgboost-deploy/xgboost @@ -189,7 +189,7 @@ pythonpack: cp -r multi-node xgboost-deploy/xgboost cp -r windows xgboost-deploy/xgboost cp -r src xgboost-deploy/xgboost - + cp python-package/setup_pip.py xgboost-deploy/setup.py #make python pythonbuild: diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 59728f3c2f79..6594954f3528 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -3,16 +3,16 @@ Type: Package Title: Extreme Gradient Boosting Version: 0.4-2 Date: 2015-08-01 -Author: Tianqi Chen , Tong He , Michael Benesty +Author: Tianqi Chen , Tong He , + Michael Benesty Maintainer: Tong He -Description: Extreme Gradient Boosting, which is an - efficient implementation of gradient boosting framework. - This package is its R interface. The package includes efficient - linear model solver and tree learning algorithms. The package can automatically - do parallel computation on a single machine which could be more than 10 times faster - than existing gradient boosting packages. It supports various - objective functions, including regression, classification and ranking. The - package is made to be extensible, so that users are also allowed to define +Description: Extreme Gradient Boosting, which is an efficient implementation + of gradient boosting framework. This package is its R interface. The package + includes efficient linear model solver and tree learning algorithms. The package + can automatically do parallel computation on a single machine which could be + more than 10 times faster than existing gradient boosting packages. It supports + various objective functions, including regression, classification and ranking. + The package is made to be extensible, so that users are also allowed to define their own objectives easily. 
License: Apache License (== 2.0) | file LICENSE URL: https://github.com/dmlc/xgboost @@ -20,16 +20,18 @@ BugReports: https://github.com/dmlc/xgboost/issues VignetteBuilder: knitr Suggests: knitr, - ggplot2 (>= 1.0.0), - DiagrammeR (>= 0.6), + ggplot2 (>= 1.0.1), + DiagrammeR (>= 0.8.1), Ckmeans.1d.dp (>= 3.3.1), vcd (>= 1.3), - testthat + testthat, + igraph (>= 1.0.1) Depends: R (>= 2.10) Imports: Matrix (>= 1.1-0), methods, - data.table (>= 1.9.4), + data.table (>= 1.9.6), magrittr (>= 1.5), stringr (>= 0.6.2) +RoxygenNote: 5.0.1 diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index a4f07799a3db..3cd80d5c2f5b 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -1,16 +1,19 @@ -# Generated by roxygen2 (4.1.1): do not edit by hand +# Generated by roxygen2: do not edit by hand export(getinfo) export(setinfo) export(slice) export(xgb.DMatrix) export(xgb.DMatrix.save) +export(xgb.create.features) export(xgb.cv) export(xgb.dump) export(xgb.importance) export(xgb.load) export(xgb.model.dt.tree) +export(xgb.plot.deepness) export(xgb.plot.importance) +export(xgb.plot.multi.trees) export(xgb.plot.tree) export(xgb.save) export(xgb.save.raw) @@ -23,6 +26,7 @@ importClassesFrom(Matrix,dgCMatrix) importClassesFrom(Matrix,dgeMatrix) importFrom(Matrix,cBind) importFrom(Matrix,colSums) +importFrom(Matrix,sparse.model.matrix) importFrom(Matrix,sparseVector) importFrom(data.table,":=") importFrom(data.table,as.data.table) @@ -35,6 +39,7 @@ importFrom(data.table,setnames) importFrom(magrittr,"%>%") importFrom(magrittr,add) importFrom(magrittr,not) +importFrom(stringr,str_detect) importFrom(stringr,str_extract) importFrom(stringr,str_extract_all) importFrom(stringr,str_match) diff --git a/R-package/R/getinfo.xgb.DMatrix.R b/R-package/R/getinfo.xgb.DMatrix.R index 26523699a2f9..3000a1e7d209 100644 --- a/R-package/R/getinfo.xgb.DMatrix.R +++ b/R-package/R/getinfo.xgb.DMatrix.R @@ -23,7 +23,6 @@ setClass('xgb.DMatrix') #' stopifnot(all(labels2 == 1-labels)) #' @rdname getinfo #' @export -#' getinfo <- function(object, ...){ UseMethod("getinfo") } @@ -35,7 +34,7 @@ getinfo <- function(object, ...){ #' @param ... other parameters #' @rdname getinfo #' @method getinfo xgb.DMatrix -setMethod("getinfo", signature = "xgb.DMatrix", +setMethod("getinfo", signature = "xgb.DMatrix", definition = function(object, name) { if (typeof(name) != "character") { stop("xgb.getinfo: name must be character") @@ -43,7 +42,7 @@ setMethod("getinfo", signature = "xgb.DMatrix", if (class(object) != "xgb.DMatrix") { stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") } - if (name != "label" && name != "weight" && + if (name != "label" && name != "weight" && name != "base_margin" && name != "nrow") { stop(paste("xgb.getinfo: unknown info name", name)) } @@ -54,4 +53,3 @@ setMethod("getinfo", signature = "xgb.DMatrix", } return(ret) }) - diff --git a/R-package/R/predict.xgb.Booster.R b/R-package/R/predict.xgb.Booster.R index 0c50b25043ce..d608f3465177 100644 --- a/R-package/R/predict.xgb.Booster.R +++ b/R-package/R/predict.xgb.Booster.R @@ -20,6 +20,17 @@ setClass("xgb.Booster", #' only valid for gbtree, but not for gblinear. set it to be value bigger #' than 0. It will use all trees by default. #' @param predleaf whether predict leaf index instead. If set to TRUE, the output will be a matrix object. 
+#' +#' @details +#' The option \code{ntreelimit} purpose is to let the user train a model with lots +#' of trees but use only the first trees for prediction to avoid overfitting +#' (without having to train a new model with less trees). +#' +#' The option \code{predleaf} purpose is inspired from §3.1 of the paper +#' \code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +#' The idea is to use the model as a generator of new features which capture non linear link +#' from original features. +#' #' @examples #' data(agaricus.train, package='xgboost') #' data(agaricus.test, package='xgboost') @@ -29,9 +40,8 @@ setClass("xgb.Booster", #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' pred <- predict(bst, test$data) #' @export -#' -setMethod("predict", signature = "xgb.Booster", - definition = function(object, newdata, missing = NULL, +setMethod("predict", signature = "xgb.Booster", + definition = function(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) { if (class(object) != "xgb.Booster"){ stop("predict: model in prediction must be of class xgb.Booster") @@ -39,11 +49,7 @@ setMethod("predict", signature = "xgb.Booster", object <- xgb.Booster.check(object, saveraw = FALSE) } if (class(newdata) != "xgb.DMatrix") { - if (is.null(missing)) { - newdata <- xgb.DMatrix(newdata) - } else { - newdata <- xgb.DMatrix(newdata, missing = missing) - } + newdata <- xgb.DMatrix(newdata, missing = missing) } if (is.null(ntreelimit)) { ntreelimit <- 0 @@ -52,14 +58,14 @@ setMethod("predict", signature = "xgb.Booster", stop("predict: ntreelimit must be equal to or greater than 1") } } - option = 0 + option <- 0 if (outputmargin) { option <- option + 1 } if (predleaf) { option <- option + 2 } - ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), + ret <- .Call("XGBoosterPredict_R", object$handle, newdata, as.integer(option), as.integer(ntreelimit), PACKAGE = "xgboost") if (predleaf){ len <- getinfo(newdata, "nrow") @@ -72,4 +78,3 @@ setMethod("predict", signature = "xgb.Booster", } return(ret) }) - diff --git a/R-package/R/predict.xgb.Booster.handle.R b/R-package/R/predict.xgb.Booster.handle.R index 685318f1219a..3e4013b759dc 100644 --- a/R-package/R/predict.xgb.Booster.handle.R +++ b/R-package/R/predict.xgb.Booster.handle.R @@ -5,15 +5,14 @@ #' @param object Object of class "xgb.Boost.handle" #' @param ... Parameters pass to \code{predict.xgb.Booster} #' -setMethod("predict", signature = "xgb.Booster.handle", +setMethod("predict", signature = "xgb.Booster.handle", definition = function(object, ...) { if (class(object) != "xgb.Booster.handle"){ stop("predict: model in prediction must be of class xgb.Booster.handle") } - + bst <- xgb.handleToBooster(object) - - ret = predict(bst, ...) + + ret <- predict(bst, ...) return(ret) }) - diff --git a/R-package/R/setinfo.xgb.DMatrix.R b/R-package/R/setinfo.xgb.DMatrix.R index 61019d8e2a5a..427de08d4ae7 100644 --- a/R-package/R/setinfo.xgb.DMatrix.R +++ b/R-package/R/setinfo.xgb.DMatrix.R @@ -21,7 +21,6 @@ #' stopifnot(all(labels2 == 1-labels)) #' @rdname setinfo #' @export -#' setinfo <- function(object, ...){ UseMethod("setinfo") } @@ -32,7 +31,7 @@ setinfo <- function(object, ...){ #' @param ... 
other parameters #' @rdname setinfo #' @method setinfo xgb.DMatrix -setMethod("setinfo", signature = "xgb.DMatrix", +setMethod("setinfo", signature = "xgb.DMatrix", definition = function(object, name, info) { xgb.setinfo(object, name, info) }) diff --git a/R-package/R/slice.xgb.DMatrix.R b/R-package/R/slice.xgb.DMatrix.R index b70a8ee92c57..4626c2b4d80a 100644 --- a/R-package/R/slice.xgb.DMatrix.R +++ b/R-package/R/slice.xgb.DMatrix.R @@ -13,7 +13,6 @@ setClass('xgb.DMatrix') #' dsub <- slice(dtrain, 1:3) #' @rdname slice #' @export -#' slice <- function(object, ...){ UseMethod("slice") } @@ -23,19 +22,19 @@ slice <- function(object, ...){ #' @param ... other parameters #' @rdname slice #' @method slice xgb.DMatrix -setMethod("slice", signature = "xgb.DMatrix", +setMethod("slice", signature = "xgb.DMatrix", definition = function(object, idxset, ...) { if (class(object) != "xgb.DMatrix") { stop("slice: first argument dtrain must be xgb.DMatrix") } - ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, + ret <- .Call("XGDMatrixSliceDMatrix_R", object, idxset, PACKAGE = "xgboost") - + attr_list <- attributes(object) nr <- xgb.numrow(object) len <- sapply(attr_list,length) - ind <- which(len==nr) - if (length(ind)>0) { + ind <- which(len == nr) + if (length(ind) > 0) { nms <- names(attr_list)[ind] for (i in 1:length(ind)) { attr(ret,nms[i]) <- attr(object,nms[i])[idxset] diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 732ef0d11b5a..926e82994d10 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -1,4 +1,4 @@ -#' @importClassesFrom Matrix dgCMatrix dgeMatrix + #' @importClassesFrom Matrix dgCMatrix dgeMatrix #' @import methods # depends on matrix @@ -15,30 +15,30 @@ xgb.setinfo <- function(dmat, name, info) { stop("xgb.setinfo: first argument dtrain must be xgb.DMatrix") } if (name == "label") { - if (length(info)!=xgb.numrow(dmat)) + if (length(info) != xgb.numrow(dmat)) stop("The length of labels must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), + .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE = "xgboost") return(TRUE) } if (name == "weight") { - if (length(info)!=xgb.numrow(dmat)) + if (length(info) != xgb.numrow(dmat)) stop("The length of weights must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), + .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE = "xgboost") return(TRUE) } if (name == "base_margin") { # if (length(info)!=xgb.numrow(dmat)) # stop("The length of base margin must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), + .Call("XGDMatrixSetInfo_R", dmat, name, as.numeric(info), PACKAGE = "xgboost") return(TRUE) } if (name == "group") { - if (sum(info)!=xgb.numrow(dmat)) + if (sum(info) != xgb.numrow(dmat)) stop("The sum of groups must equal to the number of rows in the input data") - .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), + .Call("XGDMatrixSetInfo_R", dmat, name, as.integer(info), PACKAGE = "xgboost") return(TRUE) } @@ -68,7 +68,7 @@ xgb.Booster <- function(params = list(), cachelist = list(), modelfile = NULL) { if (typeof(modelfile) == "character") { .Call("XGBoosterLoadModel_R", handle, modelfile, PACKAGE = "xgboost") } else if (typeof(modelfile) == "raw") { - .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") + .Call("XGBoosterLoadModelFromRaw_R", handle, modelfile, PACKAGE = "xgboost") } else 
{ stop("xgb.Booster: modelfile must be character or raw vector") } @@ -103,18 +103,13 @@ xgb.Booster.check <- function(bst, saveraw = TRUE) ## ----the following are low level iteratively function, not needed if ## you do not want to use them --------------------------------------- # get dmatrix from data, label -xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) { +xgb.get.DMatrix <- function(data, label = NULL, missing = NA, weight = NULL) { inClass <- class(data) if (inClass == "dgCMatrix" || inClass == "matrix") { if (is.null(label)) { stop("xgboost: need label when data is a matrix") } - dtrain <- xgb.DMatrix(data, label = label) - if (is.null(missing)){ - dtrain <- xgb.DMatrix(data, label = label) - } else { - dtrain <- xgb.DMatrix(data, label = label, missing = missing) - } + dtrain <- xgb.DMatrix(data, label = label, missing = missing) if (!is.null(weight)){ xgb.setinfo(dtrain, "weight", weight) } @@ -127,8 +122,8 @@ xgb.get.DMatrix <- function(data, label = NULL, missing = NULL, weight = NULL) { } else if (inClass == "xgb.DMatrix") { dtrain <- data } else if (inClass == "data.frame") { - stop("xgboost only support numerical matrix input, - use 'data.frame' to transform the data.") + stop("xgboost only support numerical matrix input, + use 'data.matrix' to transform the data.") } else { stop("xgboost: Invalid input of data") } @@ -147,8 +142,7 @@ xgb.iter.boost <- function(booster, dtrain, gpair) { if (class(dtrain) != "xgb.DMatrix") { stop("xgb.iter.update: second argument must be type xgb.DMatrix") } - .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, - PACKAGE = "xgboost") + .Call("XGBoosterBoostOneIter_R", booster, dtrain, gpair$grad, gpair$hess, PACKAGE = "xgboost") return(TRUE) } @@ -162,9 +156,9 @@ xgb.iter.update <- function(booster, dtrain, iter, obj = NULL) { } if (is.null(obj)) { - .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, + .Call("XGBoosterUpdateOneIter_R", booster, as.integer(iter), dtrain, PACKAGE = "xgboost") - } else { + } else { pred <- predict(booster, dtrain) gpair <- obj(pred, dtrain) succ <- xgb.iter.boost(booster, dtrain, gpair) @@ -195,7 +189,7 @@ xgb.iter.eval <- function(booster, watchlist, iter, feval = NULL, prediction = F } evnames <- append(evnames, names(w)) } - msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, + msg <- .Call("XGBoosterEvalOneIter_R", booster, as.integer(iter), watchlist, evnames, PACKAGE = "xgboost") } else { msg <- paste("[", iter, "]", sep="") @@ -253,21 +247,21 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { if (length(unique(y)) <= 5) y <- factor(y) } folds <- xgb.createFolds(y, nfold) - } else { + } else { # make simple non-stratified folds kstep <- length(randidx) %/% nfold folds <- list() - for (i in 1:(nfold-1)) { - folds[[i]] = randidx[1:kstep] - randidx = setdiff(randidx, folds[[i]]) + for (i in 1:(nfold - 1)) { + folds[[i]] <- randidx[1:kstep] + randidx <- setdiff(randidx, folds[[i]]) } - folds[[nfold]] = randidx + folds[[nfold]] <- randidx } } ret <- list() for (k in 1:nfold) { dtest <- slice(dall, folds[[k]]) - didx = c() + didx <- c() for (i in 1:nfold) { if (i != k) { didx <- append(didx, folds[[i]]) @@ -275,7 +269,7 @@ xgb.cv.mknfold <- function(dall, nfold, param, stratified, folds) { } dtrain <- slice(dall, didx) bst <- xgb.Booster(param, list(dtrain, dtest)) - watchlist = list(train=dtrain, test=dtest) + watchlist <- list(train=dtrain, test=dtest) ret[[k]] <- list(dtrain=dtrain, 
booster=bst, watchlist=watchlist, index=folds[[k]]) } return (ret) @@ -288,7 +282,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) { kv <- strsplit(header[i], ":")[[1]] ret <- paste(ret, "\t", kv[1], ":", sep="") stats <- c() - stats[1] <- as.numeric(kv[2]) + stats[1] <- as.numeric(kv[2]) for (j in 2:length(res)) { tkv <- strsplit(res[[j]][i], ":")[[1]] stats[j] <- as.numeric(tkv[2]) @@ -316,9 +310,9 @@ xgb.createFolds <- function(y, k = 10) ## At most, we will use quantiles. If the sample ## is too small, we just do regular unstratified ## CV - cuts <- floor(length(y)/k) - if(cuts < 2) cuts <- 2 - if(cuts > 5) cuts <- 5 + cuts <- floor(length(y) / k) + if (cuts < 2) cuts <- 2 + if (cuts > 5) cuts <- 5 y <- cut(y, unique(stats::quantile(y, probs = seq(0, 1, length = cuts))), include.lowest = TRUE) @@ -330,7 +324,7 @@ xgb.createFolds <- function(y, k = 10) y <- factor(as.character(y)) numInClass <- table(y) foldVector <- vector(mode = "integer", length(y)) - + ## For each class, balance the fold allocation as far ## as possible, then resample the remainder. ## The final assignment of folds is also randomized. diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 8c3ea80bcbcd..c34c65d95b4a 100644 --- a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -17,29 +17,28 @@ #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export -#' -xgb.DMatrix <- function(data, info = list(), missing = 0, ...) { +xgb.DMatrix <- function(data, info = list(), missing = NA, ...) { if (typeof(data) == "character") { - handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), + handle <- .Call("XGDMatrixCreateFromFile_R", data, as.integer(FALSE), PACKAGE = "xgboost") } else if (is.matrix(data)) { - handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, + handle <- .Call("XGDMatrixCreateFromMat_R", data, missing, PACKAGE = "xgboost") } else if (class(data) == "dgCMatrix") { - handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, + handle <- .Call("XGDMatrixCreateFromCSC_R", data@p, data@i, data@x, PACKAGE = "xgboost") } else { - stop(paste("xgb.DMatrix: does not support to construct from ", + stop(paste("xgb.DMatrix: does not support to construct from ", typeof(data))) } dmat <- structure(handle, class = "xgb.DMatrix") - + info <- append(info, list(...)) - if (length(info) == 0) + if (length(info) == 0) return(dmat) for (i in 1:length(info)) { p <- info[i] xgb.setinfo(dmat, names(p), p[[1]]) } return(dmat) -} +} diff --git a/R-package/R/xgb.DMatrix.save.R b/R-package/R/xgb.DMatrix.save.R index d58dc09debdd..63a0be6919bf 100644 --- a/R-package/R/xgb.DMatrix.save.R +++ b/R-package/R/xgb.DMatrix.save.R @@ -12,16 +12,15 @@ #' xgb.DMatrix.save(dtrain, 'xgb.DMatrix.data') #' dtrain <- xgb.DMatrix('xgb.DMatrix.data') #' @export -#' xgb.DMatrix.save <- function(DMatrix, fname) { if (typeof(fname) != "character") { stop("xgb.save: fname must be character") } if (class(DMatrix) == "xgb.DMatrix") { - .Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE), + .Call("XGDMatrixSaveBinary_R", DMatrix, fname, as.integer(FALSE), PACKAGE = "xgboost") return(TRUE) } stop("xgb.DMatrix.save: the input must be xgb.DMatrix") return(FALSE) -} +} diff --git a/R-package/R/xgb.create.features.R b/R-package/R/xgb.create.features.R new file mode 100644 index 000000000000..bd913a81c1de --- /dev/null +++ b/R-package/R/xgb.create.features.R @@ -0,0 +1,91 @@ +#' Create new features from a previously learned model +#' +#' May improve 
the learning by adding new features to the training data based on the decision trees from a previously learned model. +#' +#' @importFrom magrittr %>% +#' @importFrom Matrix cBind +#' @importFrom Matrix sparse.model.matrix +#' +#' @param model decision tree boosting model learned on the original data +#' @param training.data original data (usually provided as a \code{dgCMatrix} matrix) +#' +#' @return \code{dgCMatrix} matrix including both the original data and the new features. +#' +#' @details +#' This is the function inspired from the paragraph 3.1 of the paper: +#' +#' \strong{Practical Lessons from Predicting Clicks on Ads at Facebook} +#' +#' \emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +#' Joaquin Quiñonero Candela)} +#' +#' International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 +#' +#' \url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. +#' +#' Extract explaining the method: +#' +#' "\emph{We found that boosted decision trees are a powerful and very +#' convenient way to implement non-linear and tuple transformations +#' of the kind we just described. We treat each individual +#' tree as a categorical feature that takes as value the +#' index of the leaf an instance ends up falling in. We use +#' 1-of-K coding of this type of features. +#' +#' For example, consider the boosted tree model in Figure 1 with 2 subtrees, +#' where the first subtree has 3 leafs and the second 2 leafs. If an +#' instance ends up in leaf 2 in the first subtree and leaf 1 in +#' second subtree, the overall input to the linear classifier will +#' be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +#' correspond to the leaves of the first subtree and last 2 to +#' those of the second subtree. +#' +#' [...] +#' +#' We can understand boosted decision tree +#' based transformation as a supervised feature encoding that +#' converts a real-valued vector into a compact binary-valued +#' vector. 
A traversal from root node to a leaf node represents +#' a rule on certain features.}" +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' data(agaricus.test, package='xgboost') +#' dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) +#' dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) +#' +#' param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic') +#' nround = 4 +#' +#' bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2) +#' +#' # Model accuracy without new features +#' accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) +#' +#' # Convert previous features to one hot encoding +#' new.features.train <- xgb.create.features(model = bst, agaricus.train$data) +#' new.features.test <- xgb.create.features(model = bst, agaricus.test$data) +#' +#' # learning with new features +#' new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) +#' new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) +#' watchlist <- list(train = new.dtrain) +#' bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2) +#' +#' # Model accuracy with new features +#' accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) +#' +#' # Here the accuracy was already good and is now perfect. +#' cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n")) +#' +#' @export +xgb.create.features <- function(model, training.data){ + pred_with_leaf = predict(model, training.data, predleaf = TRUE) + cols <- list() + # iterate over the leaf-index columns returned by predict(), one per tree + for(i in 1:ncol(pred_with_leaf)){ + # max is not the real max but it is not important for the purpose of adding features + leaf.id <- sort(unique(pred_with_leaf[,i])) + cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id) + } + cBind(training.data, sparse.model.matrix( ~ . -1, as.data.frame(cols))) +} \ No newline at end of file diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index a5364db52b8d..89edbeb6330e 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -90,16 +90,15 @@ #' max.depth =3, eta = 1, objective = "binary:logistic") #' print(history) #' @export -#' -xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NULL, - prediction = FALSE, showsd = TRUE, metrics=list(), +xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = NA, + prediction = FALSE, showsd = TRUE, metrics=list(), obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, verbose = T, print.every.n=1L, early.stop.round = NULL, maximize = NULL, ...) { if (typeof(params) != "list") { stop("xgb.cv: first argument params must be list") } if(!is.null(folds)) { - if(class(folds)!="list" | length(folds) < 2) { + if(class(folds) != "list" | length(folds) < 2) { stop("folds must be a list with 2 or more elements that are vectors of indices for each CV-fold") } nfold <- length(folds) @@ -107,38 +106,34 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = if (nfold <= 1) { stop("nfold must be bigger than 1") } - if (is.null(missing)) { - dtrain <- xgb.get.DMatrix(data, label) - } else { - dtrain <- xgb.get.DMatrix(data, label, missing) - } - dot.params = list(...)
- nms.params = names(params) - nms.dot.params = names(dot.params) - if (length(intersect(nms.params,nms.dot.params))>0) + dtrain <- xgb.get.DMatrix(data, label, missing) + dot.params <- list(...) + nms.params <- names(params) + nms.dot.params <- names(dot.params) + if (length(intersect(nms.params,nms.dot.params)) > 0) stop("Duplicated defined term in parameters. Please check your list of params.") params <- append(params, dot.params) params <- append(params, list(silent=1)) for (mc in metrics) { params <- append(params, list("eval_metric"=mc)) } - + # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) stop("xgb.cv: cannot assign two different objectives") if (!is.null(params$objective)) - if (class(params$objective)=='function') { - obj = params$objective - params[['objective']] = NULL + if (class(params$objective) == 'function') { + obj <- params$objective + params[['objective']] <- NULL } # if (!is.null(params$eval_metric) && !is.null(feval)) # stop("xgb.cv: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) - if (class(params$eval_metric)=='function') { - feval = params$eval_metric - params[['eval_metric']] = NULL + if (class(params$eval_metric) == 'function') { + feval <- params$eval_metric + params[['eval_metric']] <- NULL } - + # Early Stopping if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) @@ -148,39 +143,39 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = if (is.null(maximize)) { if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE + maximize <- FALSE } else { - maximize = TRUE + maximize <- TRUE } } - + if (maximize) { - bestScore = 0 + bestScore <- 0 } else { - bestScore = Inf + bestScore <- Inf } - bestInd = 0 - earlyStopflag = FALSE - - if (length(metrics)>1) + bestInd <- 0 + earlyStopflag <- FALSE + + if (length(metrics) > 1) warning('Only the first metric is used for early stopping process.') } - + xgb_folds <- xgb.cv.mknfold(dtrain, nfold, params, stratified, folds) - obj_type = params[['objective']] - mat_pred = FALSE - if (!is.null(obj_type) && obj_type=='multi:softprob') + obj_type <- params[['objective']] + mat_pred <- FALSE + if (!is.null(obj_type) && obj_type == 'multi:softprob') { - num_class = params[['num_class']] + num_class <- params[['num_class']] if (is.null(num_class)) stop('must set num_class to use softmax') predictValues <- matrix(0,xgb.numrow(dtrain),num_class) - mat_pred = TRUE + mat_pred <- TRUE } else predictValues <- rep(0,xgb.numrow(dtrain)) history <- c() - print.every.n = max(as.integer(print.every.n), 1L) + print.every.n <- max(as.integer(print.every.n), 1L) for (i in 1:nrounds) { msg <- list() for (k in 1:nfold) { @@ -191,62 +186,60 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing = ret <- xgb.cv.aggcv(msg, showsd) history <- c(history, ret) if(verbose) - if (0==(i-1L)%%print.every.n) + if (0 == (i - 1L) %% print.every.n) cat(ret, "\n", sep="") - + # early_Stopping if (!is.null(early.stop.round)){ - score = strsplit(ret,'\\s+')[[1]][1+length(metrics)+2] - score = strsplit(score,'\\+|:')[[1]][[2]] - score = as.numeric(score) - if ((maximize && score>bestScore) || (!maximize && score bestScore) || (!maximize && score < bestScore)) { + bestScore <- score + bestInd <- i } else { - if (i-bestInd>=early.stop.round) { - earlyStopflag = TRUE + if (i - bestInd >= early.stop.round) { + earlyStopflag <- TRUE cat('Stopping. 
Best iteration:',bestInd) break } } } - } - + if (prediction) { for (k in 1:nfold) { - fd = xgb_folds[[k]] + fd <- xgb_folds[[k]] if (!is.null(early.stop.round) && earlyStopflag) { - res = xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction) + res <- xgb.iter.eval(fd$booster, fd$watchlist, bestInd - 1, feval, prediction) } else { - res = xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction) + res <- xgb.iter.eval(fd$booster, fd$watchlist, nrounds - 1, feval, prediction) } if (mat_pred) { - pred_mat = matrix(res[[2]],num_class,length(fd$index)) - predictValues[fd$index,] = t(pred_mat) + pred_mat <- matrix(res[[2]],num_class,length(fd$index)) + predictValues[fd$index,] <- t(pred_mat) } else { - predictValues[fd$index] = res[[2]] + predictValues[fd$index] <- res[[2]] } } } - - + colnames <- str_split(string = history[1], pattern = "\t")[[1]] %>% .[2:length(.)] %>% str_extract(".*:") %>% str_replace(":","") %>% str_replace("-", ".") colnamesMean <- paste(colnames, "mean") if(showsd) colnamesStd <- paste(colnames, "std") - + colnames <- c() if(showsd) for(i in 1:length(colnamesMean)) colnames <- c(colnames, colnamesMean[i], colnamesStd[i]) else colnames <- colnamesMean - + type <- rep(x = "numeric", times = length(colnames)) dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table split <- str_split(string = history, pattern = "\t") - - for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)} - + + for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist( list( dt, .), use.names = F, fill = F)} + if (prediction) { - return(list(dt = dt,pred = predictValues)) + return( list( dt = dt,pred = predictValues)) } return(dt) } diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index fae1c7d2be53..b39359abd5b0 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -36,7 +36,6 @@ #' # print the model without saving it to a file #' print(xgb.dump(bst)) #' @export -#' xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (class(model) != "xgb.Booster") { stop("model: argument must be type xgb.Booster") @@ -49,13 +48,13 @@ xgb.dump <- function(model = NULL, fname = NULL, fmap = "", with.stats=FALSE) { if (!(class(fmap) %in% c("character", "NULL") && length(fname) <= 1)) { stop("fmap: argument must be type character (when provided)") } - + longString <- .Call("XGBoosterDumpModel_R", model$handle, fmap, as.integer(with.stats), PACKAGE = "xgboost") - + dt <- fread(paste(longString, collapse = ""), sep = "\n", header = F) setnames(dt, "Lines") - + if(is.null(fname)) { result <- dt[Lines != "0"][, Lines := str_replace(Lines, "^\t+", "")][Lines != ""][, paste(Lines)] return(result) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index f7696d53e766..50a7af5cb67c 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -1,7 +1,6 @@ #' Show importance of features in a model #' -#' Read a xgboost model text dump. -#' Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). +#' Create a \code{data.table} of the most important features of a model. 
#' #' @importFrom data.table data.table #' @importFrom data.table setnames @@ -11,34 +10,30 @@ #' @importFrom Matrix cBind #' @importFrom Matrix sparseVector #' -#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}). -#' -#' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' +#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. +#' @param model generated by the \code{xgb.train} function. #' @param data the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' #' @param label the label vetor used for the training step. Will be used with \code{data} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional. -#' #' @param target a function which returns \code{TRUE} or \code{1} when an observation should be count as a co-occurence and \code{FALSE} or \code{0} otherwise. Default function is provided for computing co-occurences in a binary classification. The \code{target} function should have only one parameter. This parameter will be used to provide each important feature vector after having applied the split condition, therefore these vector will be only made of 0 and 1 only, whatever was the information before. More information in \code{Detail} part. This parameter is optional. #' #' @return A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. #' #' @details -#' This is the function to understand the model trained (and through your model, your data). -#' -#' Results are returned for both linear and tree models. +#' This function is for both linear and tree models. #' #' \code{data.table} is returned by the function. -#' There are 3 columns : +#' The columns are : #' \itemize{ -#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. -#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ; -#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; -#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning. +#' \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump; +#' \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. 
Highest percentage means important feature to predict the \code{label} used for the training (only available for tree models); +#' \item \code{Cover} metric of the number of observation related to this feature (only available for tree models); +#' \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. #' } #' +#' If you don't provide \code{feature_names}, index of the features will be used instead. +#' +#' Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R). +#' #' Co-occurence count #' ------------------ #' @@ -51,57 +46,55 @@ #' @examples #' data(agaricus.train, package='xgboost') #' -#' # Both dataset are list with two items, a sparse matrix and labels -#' # (labels = outcome column which will be learned). -#' # Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' # train$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.importance(train$data@@Dimnames[[2]], model = bst) +#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst) #' #' # Same thing with co-occurence computation this time -#' xgb.importance(train$data@@Dimnames[[2]], model = bst, data = train$data, label = train$label) +#' xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label) #' #' @export -xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ((x + label) == 2)){ - if (!class(feature_names) %in% c("character", "NULL")) { - stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. Look at this function documentation to see where to get feature names.") - } - - if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { - stop("filename_dump: Has to be a path to the model dump file.") +xgb.importance <- function(feature_names = NULL, model = NULL, data = NULL, label = NULL, target = function(x) ( (x + label) == 2)){ + if (!class(feature_names) %in% c("character", "NULL")) { + stop("feature_names: Has to be a vector of character or NULL if the model already contains feature name. 
Look at this function documentation to see where to get feature names.") } - - if (!class(model) %in% c("xgb.Booster", "NULL")) { + + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - - if((is.null(data) & !is.null(label)) |(!is.null(data) & is.null(label))) { + + if((is.null(data) & !is.null(label)) | (!is.null(data) & is.null(label))) { stop("data/label: Provide the two arguments if you want co-occurence computation or none of them if you are not interested but not one of them only.") } - + if(class(label) == "numeric"){ if(sum(label == 0) / length(label) > 0.5) label <- as(label, "sparseVector") } - if(is.null(model)){ - text <- readLines(filename_dump) - } else { - text <- xgb.dump(model = model, with.stats = T) - } + treeDump <- function(feature_names, text, keepDetail){ + if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" + xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo" := Missing == No ][Feature != "Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequency = .N), by = groupBy, with = T][,`:=`(Gain = Gain / sum(Gain), Cover = Cover / sum(Cover), Frequency = Frequency / sum(Frequency))][order(Gain, decreasing = T)] + } - if(text[2] == "bias:"){ - result <- readLines(filename_dump) %>% linearDump(feature_names, .) + linearDump <- function(feature_names, text){ + weights <- which(text == "weight:") %>% {a =. + 1; text[a:length(text)]} %>% as.numeric + if(is.null(feature_names)) feature_names <- seq(to = length(weights)) + data.table(Feature = feature_names, Weight = weights) + } + + model.text.dump <- xgb.dump(model = model, with.stats = T) + + if(model.text.dump[2] == "bias:"){ + result <- model.text.dump %>% linearDump(feature_names, .) if(!is.null(data) | !is.null(label)) warning("data/label: these parameters should only be provided with decision tree based models.") } else { - result <- treeDump(feature_names, text = text, keepDetail = !is.null(data)) - + result <- treeDump(feature_names, text = model.text.dump, keepDetail = !is.null(data)) + # Co-occurence computation if(!is.null(data) & !is.null(label) & nrow(result) > 0) { - # Take care of missing column + # Take care of missing column a <- data[, result[MissingNo == T,Feature], drop=FALSE] != 0 # Bind the two Matrix and reorder columns c <- data[, result[MissingNo == F,Feature], drop=FALSE] %>% cBind(a,.) %>% .[,result[,Feature]] @@ -109,25 +102,13 @@ xgb.importance <- function(feature_names = NULL, filename_dump = NULL, model = N # Apply split d <- data[, result[,Feature], drop=FALSE] < as.numeric(result[,Split]) apply(c & d, 2, . 
%>% target %>% sum) -> vec - - result <- result[, "RealCover":= as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo:=NULL] - } + + result <- result[, "RealCover" := as.numeric(vec), with = F][, "RealCover %" := RealCover / sum(label)][,MissingNo := NULL] + } } result } -treeDump <- function(feature_names, text, keepDetail){ - if(keepDetail) groupBy <- c("Feature", "Split", "MissingNo") else groupBy <- "Feature" - - result <- xgb.model.dt.tree(feature_names = feature_names, text = text)[,"MissingNo":= Missing == No ][Feature!="Leaf",.(Gain = sum(Quality), Cover = sum(Cover), Frequence = .N), by = groupBy, with = T][,`:=`(Gain = Gain/sum(Gain), Cover = Cover/sum(Cover), Frequence = Frequence/sum(Frequence))][order(Gain, decreasing = T)] - - result -} - -linearDump <- function(feature_names, text){ - which(text == "weight:") %>% {a=.+1;text[a:length(text)]} %>% as.numeric %>% data.table(Feature = feature_names, Weight = .) -} - # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... diff --git a/R-package/R/xgb.load.R b/R-package/R/xgb.load.R index b69a719cf446..03d6a4842a9e 100644 --- a/R-package/R/xgb.load.R +++ b/R-package/R/xgb.load.R @@ -15,11 +15,10 @@ #' bst <- xgb.load('xgb.model') #' pred <- predict(bst, test$data) #' @export -#' xgb.load <- function(modelfile) { - if (is.null(modelfile)) + if (is.null(modelfile)) stop("xgb.load: modelfile cannot be NULL") - + handle <- xgb.Booster(modelfile = modelfile) # re-use modelfile if it is raw so we donot need to serialize if (typeof(modelfile) == "raw") { @@ -29,4 +28,4 @@ xgb.load <- function(modelfile) { } bst <- xgb.Booster.check(bst) return(bst) -} +} diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index d083566a56bd..0083dae93ad9 100644 --- a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -1,6 +1,6 @@ -#' Convert tree model dump to data.table +#' Parse boosted tree model text dump #' -#' Read a tree model text dump and return a data.table. +#' Parse a boosted tree model text dump and return a \code{data.table}. #' #' @importFrom data.table data.table #' @importFrom data.table set @@ -12,20 +12,20 @@ #' @importFrom magrittr add #' @importFrom stringr str_extract #' @importFrom stringr str_split -#' @importFrom stringr str_extract #' @importFrom stringr str_trim -#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). -#' @param model dump generated by the \code{xgb.train} function. Avoid the creation of a dump file. -#' @param text dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). -#' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. +#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value). 
+#' @param model object created by the \code{xgb.train} function. +#' @param text \code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}). +#' @param n_first_tree limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model. #' -#' @return A \code{data.table} of the features used in the model with their gain, cover and few other thing. +#' @return A \code{data.table} of the features used in the model with their gain, cover and few other information. #' #' @details -#' General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. +#' General function to convert a text dump of tree model to a \code{data.table}. +#' +#' The purpose is to help user to explore the model and get a better understanding of it. #' -#' The content of the \code{data.table} is organised that way: +#' The columns of the \code{data.table} are: #' #' \itemize{ #' \item \code{ID}: unique identifier of a node ; @@ -37,89 +37,73 @@ #' \item \code{Quality}: it's the gain related to the split in this specific node ; #' \item \code{Cover}: metric to measure the number of observation affected by the split ; #' \item \code{Tree}: ID of the tree. It is included in the main ID ; -#' \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; +#' \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ; #' } #' #' @examples #' data(agaricus.train, package='xgboost') #' -#' #Both dataset are list with two items, a sparse matrix and labels -#' #(labels = outcome column which will be learned). -#' #Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.model.dt.tree(agaricus.train$data@@Dimnames[[2]], model = bst) +#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' xgb.model.dt.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, text = NULL, n_first_tree = NULL){ - - if (!class(feature_names) %in% c("character", "NULL")) { +xgb.model.dt.tree <- function(feature_names = NULL, model = NULL, text = NULL, n_first_tree = NULL){ + + if (!class(feature_names) %in% c("character", "NULL")) { stop("feature_names: Has to be a vector of character or NULL if the model dump already contains feature name. 
Look at this function documentation to see where to get feature names.") } - if (!(class(filename_dump) %in% c("character", "NULL") && length(filename_dump) <= 1)) { - stop("filename_dump: Has to be a character vector of size 1 representing the path to the model dump file.") - } else if (!is.null(filename_dump) && !file.exists(filename_dump)) { - stop("filename_dump: path to the model doesn't exist.") - } else if(is.null(filename_dump) && is.null(model) && is.null(text)){ - stop("filename_dump & model & text: no path to dump model, no model, no text dump, have been provided.") - } - - if (!class(model) %in% c("xgb.Booster", "NULL")) { - stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") - } - - if (!class(text) %in% c("character", "NULL")) { - stop("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") + + if (class(model) != "xgb.Booster" & class(text) != "character") { + "model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.\n" %>% + paste0("text: Has to be a vector of character or NULL if a path to the model dump has already been provided.") %>% + stop() } - + if (!class(n_first_tree) %in% c("numeric", "NULL") | length(n_first_tree) > 1) { stop("n_first_tree: Has to be a numeric vector of size 1.") } - - if(!is.null(model)){ - text = xgb.dump(model = model, with.stats = T) - } else if(!is.null(filename_dump)){ - text <- readLines(filename_dump) %>% str_trim(side = "both") + + if(is.null(text)){ + text <- xgb.dump(model = model, with.stats = T) } - position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text)+1) - + position <- str_match(text, "booster") %>% is.na %>% not %>% which %>% c(length(text) + 1) + extract <- function(x, pattern) str_extract(x, pattern) %>% str_split("=") %>% lapply(function(x) x[2] %>% as.numeric) %>% unlist - + n_round <- min(length(position) - 1, n_first_tree) - + addTreeId <- function(x, i) paste(i,x,sep = "-") - + allTrees <- data.table() - - anynumber_regex<-"[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" - for(i in 1:n_round){ - - tree <- text[(position[i]+1):(position[i+1]-1)] - + + anynumber_regex <- "[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?" + for (i in 1:n_round){ + + tree <- text[(position[i] + 1):(position[i + 1] - 1)] + # avoid tree made of a leaf only (no split) - if(length(tree) <2) next - - treeID <- i-1 - + if(length(tree) < 2) next + + treeID <- i - 1 + notLeaf <- str_match(tree, "leaf") %>% is.na leaf <- notLeaf %>% not %>% tree[.] branch <- notLeaf %>% tree[.] 
idBranch <- str_extract(branch, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) idLeaf <- str_extract(leaf, "\\d*:") %>% str_replace(":", "") %>% addTreeId(treeID) - featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric + featureBranch <- str_extract(branch, "f\\d*<") %>% str_replace("<", "") %>% str_replace("f", "") %>% as.numeric if(!is.null(feature_names)){ featureBranch <- feature_names[featureBranch + 1] } featureLeaf <- rep("Leaf", length(leaf)) - splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "") - splitLeaf <- rep(NA, length(leaf)) + splitBranch <- str_extract(branch, paste0("<",anynumber_regex,"\\]")) %>% str_replace("<", "") %>% str_replace("\\]", "") + splitLeaf <- rep(NA, length(leaf)) yesBranch <- extract(branch, "yes=\\d*") %>% addTreeId(treeID) - yesLeaf <- rep(NA, length(leaf)) + yesLeaf <- rep(NA, length(leaf)) noBranch <- extract(branch, "no=\\d*") %>% addTreeId(treeID) noLeaf <- rep(NA, length(leaf)) missingBranch <- extract(branch, "missing=\\d+") %>% addTreeId(treeID) @@ -128,42 +112,42 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model qualityLeaf <- extract(leaf, paste0("leaf=",anynumber_regex)) coverBranch <- extract(branch, "cover=\\d*\\.*\\d*") coverLeaf <- extract(leaf, "cover=\\d*\\.*\\d*") - dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree:=treeID] - + dt <- data.table(ID = c(idBranch, idLeaf), Feature = c(featureBranch, featureLeaf), Split = c(splitBranch, splitLeaf), Yes = c(yesBranch, yesLeaf), No = c(noBranch, noLeaf), Missing = c(missingBranch, missingLeaf), Quality = c(qualityBranch, qualityLeaf), Cover = c(coverBranch, coverLeaf))[order(ID)][,Tree := treeID] + allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F) } - + yes <- allTrees[!is.na(Yes), Yes] - - set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Feature", + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), + j = "Yes.Feature", value = allTrees[ID %in% yes, Feature]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Cover", + j = "Yes.Cover", value = allTrees[ID %in% yes, Cover]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "Yes.Quality", + j = "Yes.Quality", value = allTrees[ID %in% yes, Quality]) no <- allTrees[!is.na(No), No] - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Feature", + j = "No.Feature", value = allTrees[ID %in% no, Feature]) - + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Cover", + j = "No.Cover", value = allTrees[ID %in% no, Cover]) - - set(allTrees, i = which(allTrees[, Feature] != "Leaf"), - j = "No.Quality", + + set(allTrees, i = which(allTrees[, Feature] != "Leaf"), + j = "No.Quality", value = allTrees[ID %in% no, Quality]) - + allTrees } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... 
-globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequence")) +globalVariables(c("ID", "Tree", "Yes", ".", ".N", "Feature", "Cover", "Quality", "No", "Gain", "Frequency")) \ No newline at end of file diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R new file mode 100644 index 000000000000..0efd783acd60 --- /dev/null +++ b/R-package/R/xgb.plot.deepness.R @@ -0,0 +1,160 @@ +#' Plot multiple graphs at the same time +#' +#' Plot multiple graph aligned by rows and columns. +#' +#' @importFrom data.table data.table +#' @param cols number of columns +#' @return NULL +multiplot <- function(..., cols = 1) { + plots <- list(...) + numPlots = length(plots) + + layout <- matrix(seq(1, cols * ceiling(numPlots / cols)), + ncol = cols, nrow = ceiling(numPlots / cols)) + + if (numPlots == 1) { + print(plots[[1]]) + } else { + grid::grid.newpage() + grid::pushViewport(grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))) + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.table(which(layout == i, arr.ind = TRUE)) + + print( + plots[[i]], vp = grid::viewport( + layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col + ) + ) + } + } +} + +#' Parse the graph to extract vector of edges +#' @param element igraph object containing the path from the root to the leaf. +edge.parser <- function(element) { + edges.vector <- igraph::as_ids(element) + t <- tail(edges.vector, n = 1) + l <- length(edges.vector) + list(t,l) +} + +#' Extract path from root to leaf from data.table +#' @param dt.tree data.table containing the nodes and edges of the trees +get.paths.to.leaf <- function(dt.tree) { + dt.not.leaf.edges <- + dt.tree[Feature != "Leaf",.(ID, Yes, Tree)] %>% list(dt.tree[Feature != "Leaf",.(ID, No, Tree)]) %>% rbindlist(use.names = F) + + trees <- dt.tree[,unique(Tree)] + + paths <- list() + for (tree in trees) { + graph <- + igraph::graph_from_data_frame(dt.not.leaf.edges[Tree == tree]) + paths.tmp <- + igraph::shortest_paths(graph, from = paste0(tree, "-0"), to = dt.tree[Tree == tree & + Feature == "Leaf", c(ID)]) + paths <- c(paths, paths.tmp$vpath) + } + paths +} + +#' Plot model trees deepness +#' +#' Generate a graph to plot the distribution of deepness among trees. +#' +#' @importFrom data.table data.table +#' @importFrom data.table rbindlist +#' @importFrom data.table setnames +#' @importFrom data.table := +#' @importFrom magrittr %>% +#' @param model dump generated by the \code{xgb.train} function. +#' +#' @return Two graphs showing the distribution of the model deepness. +#' +#' @details +#' Display both the number of \code{leaf} and the distribution of \code{weighted observations} +#' by tree deepness level. +#' +#' The purpose of this function is to help the user to find the best trade-off to set +#' the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. +#' +#' See \link{xgb.train} for more information about these parameters. +#' +#' The graph is made of two parts: +#' +#' \itemize{ +#' \item Count: number of leaf per level of deepness; +#' \item Weighted cover: noramlized weighted cover per leaf (weighted number of instances). 
+#' } +#' +#' This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", +#' min_child_weight = 50) +#' +#' xgb.plot.deepness(model = bst) +#' +#' @export +xgb.plot.deepness <- function(model = NULL) { + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("ggplot2 package is required for plotting the graph deepness.", + call. = FALSE) + } + + if (!requireNamespace("igraph", quietly = TRUE)) { + stop("igraph package is required for plotting the graph deepness.", + call. = FALSE) + } + + if (!requireNamespace("grid", quietly = TRUE)) { + stop("grid package is required for plotting the graph deepness.", + call. = FALSE) + } + + if (class(model) != "xgb.Booster") { + stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") + } + + dt.tree <- xgb.model.dt.tree(model = model) + + dt.edge.elements <- data.table() + paths <- get.paths.to.leaf(dt.tree) + + dt.edge.elements <- + lapply(paths, edge.parser) %>% rbindlist %>% setnames(c("last.edge", "size")) %>% + merge(dt.tree, by.x = "last.edge", by.y = "ID") %>% rbind(dt.edge.elements) + + dt.edge.summuize <- + dt.edge.elements[, .(.N, Cover = sum(Cover)), size][,Cover:= Cover / sum(Cover)] + + p1 <- + ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x = size, y = N, group = 1)) + + ggplot2::xlab("") + ggplot2::ylab("Count") + ggplot2::ggtitle("Model complexity") + + ggplot2::theme( + plot.title = ggplot2::element_text(lineheight = 0.9, face = "bold"), + panel.grid.major.y = ggplot2::element_blank(), + axis.ticks = ggplot2::element_blank(), + axis.text.x = ggplot2::element_blank() + ) + + p2 <- + ggplot2::ggplot(dt.edge.summuize) + ggplot2::geom_line(ggplot2::aes(x =size, y = Cover, group = 1)) + + ggplot2::xlab("From root to leaf path length") + ggplot2::ylab("Weighted cover") + + multiplot(p1,p2,cols = 1) +} + +# Avoid error messages during CRAN check. +# The reason is that these variables are never declared +# They are mainly column names inferred by Data.table... +globalVariables( + c( + "Feature", "Count", "ggplot", "aes", "geom_bar", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text", "ID", "Yes", "No", "Tree" + ) +) diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index f126dfe464ae..1fcd7c01438d 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -1,57 +1,79 @@ #' Plot feature importance bar graph -#' -#' Read a data.table containing feature importance details and plot it. -#' +#' +#' Read a data.table containing feature importance details and plot it (for both GLM and Trees). +#' #' @importFrom magrittr %>% #' @param importance_matrix a \code{data.table} returned by the \code{xgb.importance} function. #' @param numberOfClusters a \code{numeric} vector containing the min and the max range of the possible number of clusters of bars. #' #' @return A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar. 
#' -#' @details +#' @details #' The purpose of this function is to easily represent the importance of each feature of a model. -#' The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it). -#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. -#' +#' The function returns a ggplot graph, therefore each of its characteristics can be overridden (to customize it). +#' In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. +#' #' @examples #' data(agaricus.train, package='xgboost') -#' -#' #Both dataset are list with two items, a sparse matrix and labels -#' #(labels = outcome column which will be learned). +#' +#' #Both datasets are lists with two items, a sparse matrix and labels +#' #(labels = outcome column which will be learned). #' #Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#' -#' #train$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' importance_matrix <- xgb.importance(train$data@@Dimnames[[2]], model = bst) +#' +#' #agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' importance_matrix <- xgb.importance(agaricus.train$data@@Dimnames[[2]], model = bst) #' xgb.plot.importance(importance_matrix) -#' +#' #' @export -xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1:10)){ - if (!"data.table" %in% class(importance_matrix)) { - stop("importance_matrix: Should be a data.table.") - } - if (!requireNamespace("ggplot2", quietly = TRUE)) { - stop("ggplot2 package is required for plotting the importance", call. = FALSE) - } - if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { - stop("Ckmeans.1d.dp package is required for plotting the importance", call. = FALSE) - } - - # To avoid issues in clustering when co-occurences are used - importance_matrix <- importance_matrix[, .(Gain = sum(Gain)), by = Feature] - - clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters)) - importance_matrix[,"Cluster":=clusters$cluster %>% as.character] +xgb.plot.importance <- + function(importance_matrix = NULL, numberOfClusters = c(1:10)) { + if (!"data.table" %in% class(importance_matrix)) { + stop("importance_matrix: Should be a data.table.") + } + if (!requireNamespace("ggplot2", quietly = TRUE)) { + stop("ggplot2 package is required for plotting the importance", call. = FALSE) + } + if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) { + stop("Ckmeans.1d.dp package is required for plotting the importance", call.
= FALSE) + } + + if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Gain", "Cover", "Frequency")))){ + y.axe.name <- "Gain" + } else if(isTRUE(all.equal(colnames(importance_matrix), c("Feature", "Weight")))){ + y.axe.name <- "Weight" + } else { + stop("Importance matrix is not correct (column names issue)") + } - plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+ ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() ) - - return(plot) -} + # To avoid issues in clustering when co-occurrences are used + importance_matrix <- + importance_matrix[, .(Gain.or.Weight = sum(get(y.axe.name))), by = Feature] + + clusters <- + suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain.or.Weight], numberOfClusters)) + importance_matrix[,"Cluster":= clusters$cluster %>% as.character] + + plot <- + ggplot2::ggplot( + importance_matrix, ggplot2::aes( + x = stats::reorder(Feature, Gain.or.Weight), y = Gain.or.Weight, width = 0.05 + ), environment = environment() + ) + ggplot2::geom_bar(ggplot2::aes(fill = Cluster), stat = "identity", position = + "identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab(y.axe.name) + ggplot2::ggtitle("Feature importance") + ggplot2::theme( + plot.title = ggplot2::element_text(lineheight = .9, face = "bold"), panel.grid.major.y = ggplot2::element_blank() + ) + + return(plot) + } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Feature", "Gain", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text")) +globalVariables( + c( + "Feature", "Gain.or.Weight", "Cluster", "ggplot", "aes", "geom_bar", "coord_flip", "xlab", "ylab", "ggtitle", "theme", "element_blank", "element_text" + ) +) diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R new file mode 100644 index 000000000000..c61cb8cd4daf --- /dev/null +++ b/R-package/R/xgb.plot.multi.trees.R @@ -0,0 +1,114 @@ +#' Project all trees on one tree and plot it +#' +#' Visualization of the ensemble of trees as a single collective unit. +#' +#' @importFrom data.table data.table +#' @importFrom data.table rbindlist +#' @importFrom data.table setnames +#' @importFrom data.table := +#' @importFrom magrittr %>% +#' @importFrom stringr str_detect +#' @importFrom stringr str_extract +#' +#' @param model generated by the \code{xgb.train} function. +#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. +#' @param features.keep number of features to keep in each position of the multi trees. +#' @param plot.width width in pixels of the graph to produce +#' @param plot.height height in pixels of the graph to produce +#' +#' @return A \code{DiagrammeR} graph of the projected trees. +#' +#' @details +#' +#' This function tries to capture the complexity of gradient boosted tree ensemble +#' in a cohesive way.
+#' +#' The goal is to improve the interpretability of the model generally seen as black box. +#' The function is dedicated to boosting applied to decision trees only. +#' +#' The purpose is to move from an ensemble of trees to a single tree only. +#' +#' It takes advantage of the fact that the shape of a binary tree is only defined by +#' its deepness (therefore in a boosting model, all trees have the same shape). +#' +#' Moreover, the trees tend to reuse the same features. +#' +#' The function will project each tree on one, and keep for each position the +#' \code{features.keep} first features (based on Gain per feature measure). +#' +#' This function is inspired by this blog post: +#' \url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} +#' +#' @examples +#' data(agaricus.train, package='xgboost') +#' +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, +#' eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", +#' min_child_weight = 50) +#' +#' p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3) +#' print(p) +#' +#' @export +xgb.plot.multi.trees <- function(model, feature_names = NULL, features.keep = 5, plot.width = NULL, plot.height = NULL){ + tree.matrix <- xgb.model.dt.tree(feature_names = feature_names, model = model) + + # first number of the path represents the tree, then the following numbers are related to the path to follow + # root init + root.nodes <- tree.matrix[str_detect(ID, "\\d+-0"), ID] + tree.matrix[ID %in% root.nodes, abs.node.position:=root.nodes] + + precedent.nodes <- root.nodes + + while(tree.matrix[,sum(is.na(abs.node.position))] > 0) { + yes.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(Yes)] + no.row.nodes <- tree.matrix[abs.node.position %in% precedent.nodes & !is.na(No)] + yes.nodes.abs.pos <- yes.row.nodes[, abs.node.position] %>% paste0("_0") + no.nodes.abs.pos <- no.row.nodes[, abs.node.position] %>% paste0("_1") + + tree.matrix[ID %in% yes.row.nodes[, Yes], abs.node.position := yes.nodes.abs.pos] + tree.matrix[ID %in% no.row.nodes[, No], abs.node.position := no.nodes.abs.pos] + precedent.nodes <- c(yes.nodes.abs.pos, no.nodes.abs.pos) + } + + tree.matrix[!is.na(Yes),Yes:= paste0(abs.node.position, "_0")] + tree.matrix[!is.na(No),No:= paste0(abs.node.position, "_1")] + + + + remove.tree <- . 
%>% str_replace(pattern = "^\\d+-", replacement = "") + + tree.matrix[,`:=`(abs.node.position=remove.tree(abs.node.position), Yes=remove.tree(Yes), No=remove.tree(No))] + + nodes.dt <- tree.matrix[,.(Quality = sum(Quality)),by = .(abs.node.position, Feature)][,.(Text =paste0(Feature[1:min(length(Feature), features.keep)], " (", Quality[1:min(length(Quality), features.keep)], ")") %>% paste0(collapse = "\n")), by=abs.node.position] + edges.dt <- tree.matrix[Feature != "Leaf",.(abs.node.position, Yes)] %>% list(tree.matrix[Feature != "Leaf",.(abs.node.position, No)]) %>% rbindlist() %>% setnames(c("From", "To")) %>% .[,.N,.(From, To)] %>% .[,N:=NULL] + + nodes <- DiagrammeR::create_nodes(nodes = nodes.dt[,abs.node.position], + label = nodes.dt[,Text], + style = "filled", + color = "DimGray", + fillcolor= "Beige", + shape = "oval", + fontname = "Helvetica" + ) + + edges <- DiagrammeR::create_edges(from = edges.dt[,From], + to = edges.dt[,To], + color = "DimGray", + arrowsize = "1.5", + arrowhead = "vee", + fontname = "Helvetica", + rel = "leading_to") + + graph <- DiagrammeR::create_graph(nodes_df = nodes, + edges_df = edges, + graph_attrs = "rankdir = LR") + + DiagrammeR::render_graph(graph, width = plot.width, height = plot.height) +} + +globalVariables( + c( + "Feature", "no.nodes.abs.pos", "ID", "Yes", "No", "Tree", "yes.nodes.abs.pos", "abs.node.position" + ) +) \ No newline at end of file diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index edcd5f47f65b..3d9d55c9f3a5 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -1,27 +1,15 @@ #' Plot a boosted tree model #' -#' Read a tree model text dump. -#' Plotting only works for boosted tree model (not linear model). +#' Read a tree model text dump and plot the model. #' #' @importFrom data.table data.table -#' @importFrom data.table set -#' @importFrom data.table rbindlist #' @importFrom data.table := -#' @importFrom data.table copy #' @importFrom magrittr %>% -#' @importFrom magrittr not -#' @importFrom magrittr add -#' @importFrom stringr str_extract -#' @importFrom stringr str_split -#' @importFrom stringr str_extract -#' @importFrom stringr str_trim -#' @param feature_names names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. -#' @param filename_dump the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument). +#' @param feature_names names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}. #' @param model generated by the \code{xgb.train} function. Avoid the creation of a dump file. #' @param n_first_tree limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models. -#' @param CSSstyle a \code{character} vector storing a css style to customize the appearance of nodes. Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information. -#' @param width the width of the diagram in pixels. -#' @param height the height of the diagram in pixels. +#' @param plot.width the width of the diagram in pixels. +#' @param plot.height the height of the diagram in pixels. 
#' #' @return A \code{DiagrammeR} of the model. #' @@ -30,68 +18,67 @@ #' The content of each node is organised that way: #' #' \itemize{ -#' \item \code{feature} value ; -#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ; +#' \item \code{feature} value; +#' \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be; #' \item \code{gain}: metric the importance of the node in the model. #' } #' -#' Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -#' It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. +#' The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose. #' #' @examples #' data(agaricus.train, package='xgboost') #' -#' #Both dataset are list with two items, a sparse matrix and labels -#' #(labels = outcome column which will be learned). -#' #Each column of the sparse Matrix is a feature in one hot encoding format. -#' train <- agaricus.train -#' -#' bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, #' eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") #' -#' #agaricus.test$data@@Dimnames[[2]] represents the column names of the sparse matrix. -#' xgb.plot.tree(agaricus.train$data@@Dimnames[[2]], model = bst) +#' # agaricus.train$data@@Dimnames[[2]] represents the column names of the sparse matrix. +#' xgb.plot.tree(feature_names = agaricus.train$data@@Dimnames[[2]], model = bst) #' #' @export -#' -xgb.plot.tree <- function(feature_names = NULL, filename_dump = NULL, model = NULL, n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL){ - - if (!(class(CSSstyle) %in% c("character", "NULL") && length(CSSstyle) <= 1)) { - stop("style: Has to be a character vector of size 1.") - } - - if (!class(model) %in% c("xgb.Booster", "NULL")) { +xgb.plot.tree <- function(feature_names = NULL, model = NULL, n_first_tree = NULL, plot.width = NULL, plot.height = NULL){ + + if (class(model) != "xgb.Booster") { stop("model: Has to be an object of class xgb.Booster model generaged by the xgb.train function.") } - + if (!requireNamespace("DiagrammeR", quietly = TRUE)) { stop("DiagrammeR package is required for xgb.plot.tree", call. = FALSE) } - if(is.null(model)){ - allTrees <- xgb.model.dt.tree(feature_names = feature_names, filename_dump = filename_dump, n_first_tree = n_first_tree) - } else { - allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) - } - - allTrees[Feature!="Leaf" ,yesPath:= paste(ID,"(", Feature, "
Cover: ", Cover, "
Gain: ", Quality, ")-->|< ", Split, "|", Yes, ">", Yes.Feature, "]", sep = "")] + allTrees <- xgb.model.dt.tree(feature_names = feature_names, model = model, n_first_tree = n_first_tree) - allTrees[Feature!="Leaf" ,noPath:= paste(ID,"(", Feature, ")-->|>= ", Split, "|", No, ">", No.Feature, "]", sep = "")] + allTrees[, label:= paste0(Feature, "\nCover: ", Cover, "\nGain: ", Quality)] + allTrees[, shape:= "rectangle"][Feature == "Leaf", shape:= "oval"] + allTrees[, filledcolor:= "Beige"][Feature == "Leaf", filledcolor:= "Khaki"] + # rev is used to put the first tree on top. + nodes <- DiagrammeR::create_nodes(nodes = allTrees[,ID] %>% rev, + label = allTrees[,label] %>% rev, + style = "filled", + color = "DimGray", + fillcolor= allTrees[,filledcolor] %>% rev, + shape = allTrees[,shape] %>% rev, + data = allTrees[,Feature] %>% rev, + fontname = "Helvetica" + ) - if(is.null(CSSstyle)){ - CSSstyle <- "classDef greenNode fill:#A2EB86, stroke:#04C4AB, stroke-width:2px;classDef redNode fill:#FFA070, stroke:#FF5E5E, stroke-width:2px" - } - - yes <- allTrees[Feature!="Leaf", c(Yes)] %>% paste(collapse = ",") %>% paste("class ", ., " greenNode", sep = "") - - no <- allTrees[Feature!="Leaf", c(No)] %>% paste(collapse = ",") %>% paste("class ", ., " redNode", sep = "") + edges <- DiagrammeR::create_edges(from = allTrees[Feature != "Leaf", c(ID)] %>% rep(2), + to = allTrees[Feature != "Leaf", c(Yes, No)], + label = allTrees[Feature != "Leaf", paste("<",Split)] %>% c(rep("",nrow(allTrees[Feature != "Leaf"]))), + color = "DimGray", + arrowsize = "1.5", + arrowhead = "vee", + fontname = "Helvetica", + rel = "leading_to") + + graph <- DiagrammeR::create_graph(nodes_df = nodes, + edges_df = edges, + graph_attrs = "rankdir = LR") - path <- allTrees[Feature!="Leaf", c(yesPath, noPath)] %>% .[order(.)] %>% paste(sep = "", collapse = ";") %>% paste("graph LR", .,collapse = "", sep = ";") %>% paste(CSSstyle, yes, no, sep = ";") - DiagrammeR::mermaid(path, width, height) + DiagrammeR::render_graph(graph, width = plot.width, height = plot.height) } # Avoid error messages during CRAN check. # The reason is that these variables are never declared # They are mainly column names inferred by Data.table... -globalVariables(c("Feature", "yesPath", "ID", "Cover", "Quality", "Split", "Yes", "Yes.Feature", "noPath", "No", "No.Feature", ".")) +globalVariables(c("Feature", "ID", "Cover", "Quality", "Split", "Yes", "No", ".", "shape", "filledcolor", "label")) diff --git a/R-package/R/xgb.save.R b/R-package/R/xgb.save.R index 2600b8cff261..7d595ddc6128 100644 --- a/R-package/R/xgb.save.R +++ b/R-package/R/xgb.save.R @@ -16,7 +16,6 @@ #' bst <- xgb.load('xgb.model') #' pred <- predict(bst, test$data) #' @export -#' xgb.save <- function(model, fname) { if (typeof(fname) != "character") { stop("xgb.save: fname must be character") @@ -29,4 +28,4 @@ xgb.save <- function(model, fname) { stop("xgb.save: the input must be xgb.Booster. 
Use xgb.DMatrix.save to save xgb.DMatrix object.") return(FALSE) -} +} diff --git a/R-package/R/xgb.save.raw.R b/R-package/R/xgb.save.raw.R index e885e6e7e9b7..e61303addfe2 100644 --- a/R-package/R/xgb.save.raw.R +++ b/R-package/R/xgb.save.raw.R @@ -16,7 +16,6 @@ #' bst <- xgb.load(raw) #' pred <- predict(bst, test$data) #' @export -#' xgb.save.raw <- function(model) { if (class(model) == "xgb.Booster"){ model <- model$handle diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index b1d79d8660cd..d7fa6e1ee0ec 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -19,7 +19,7 @@ #' \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 #' \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. #' \item \code{max_depth} maximum depth of a tree. Default: 6 -#' \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 +#' \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 #' \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 #' \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 #' \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 @@ -43,7 +43,7 @@ #' \item \code{binary:logistic} logistic regression for binary classification. Output probability. #' \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. #' \item \code{num_class} set the number of classes. To use only with multiclass objectives. -#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}. +#' \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}. 
#' \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. #' \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. #' } @@ -89,6 +89,7 @@ #' \itemize{ #' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} #' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} +#' \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss} #' \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. #' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. @@ -119,10 +120,9 @@ #' param <- list(max.depth = 2, eta = 1, silent = 1, objective=logregobj,eval_metric=evalerror) #' bst <- xgb.train(param, dtrain, nthread = 2, nround = 2, watchlist) #' @export -#' -xgb.train <- function(params=list(), data, nrounds, watchlist = list(), +xgb.train <- function(params=list(), data, nrounds, watchlist = list(), obj = NULL, feval = NULL, verbose = 1, print.every.n=1L, - early.stop.round = NULL, maximize = NULL, + early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) { dtrain <- data if (typeof(params) != "list") { @@ -139,30 +139,31 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (length(watchlist) != 0 && verbose == 0) { warning('watchlist is provided but verbose=0, no evaluation information will be printed') } - - dot.params = list(...) - nms.params = names(params) - nms.dot.params = names(dot.params) - if (length(intersect(nms.params,nms.dot.params))>0) + + fit.call <- match.call() + dot.params <- list(...) + nms.params <- names(params) + nms.dot.params <- names(dot.params) + if (length(intersect(nms.params,nms.dot.params)) > 0) stop("Duplicated term in parameters. 
Please check your list of params.") - params = append(params, dot.params) - + params <- append(params, dot.params) + # customized objective and evaluation metric interface if (!is.null(params$objective) && !is.null(obj)) stop("xgb.train: cannot assign two different objectives") if (!is.null(params$objective)) - if (class(params$objective)=='function') { - obj = params$objective - params$objective = NULL + if (class(params$objective) == 'function') { + obj <- params$objective + params$objective <- NULL } if (!is.null(params$eval_metric) && !is.null(feval)) stop("xgb.train: cannot assign two different evaluation metrics") if (!is.null(params$eval_metric)) - if (class(params$eval_metric)=='function') { - feval = params$eval_metric - params$eval_metric = NULL + if (class(params$eval_metric) == 'function') { + feval <- params$eval_metric + params$eval_metric <- NULL } - + # Early stopping if (!is.null(early.stop.round)){ if (!is.null(feval) && is.null(maximize)) @@ -174,44 +175,43 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), if (is.null(maximize)) { if (params$eval_metric %in% c('rmse','logloss','error','merror','mlogloss')) { - maximize = FALSE + maximize <- FALSE } else { - maximize = TRUE + maximize <- TRUE } } - + if (maximize) { - bestScore = 0 + bestScore <- 0 } else { - bestScore = Inf + bestScore <- Inf } - bestInd = 0 + bestInd <- 0 earlyStopflag = FALSE - - if (length(watchlist)>1) + + if (length(watchlist) > 1) warning('Only the first data set in watchlist is used for early stopping process.') } - - + handle <- xgb.Booster(params, append(watchlist, dtrain)) bst <- xgb.handleToBooster(handle) - print.every.n=max( as.integer(print.every.n), 1L) + print.every.n <- max( as.integer(print.every.n), 1L) for (i in 1:nrounds) { succ <- xgb.iter.update(bst$handle, dtrain, i - 1, obj) if (length(watchlist) != 0) { msg <- xgb.iter.eval(bst$handle, watchlist, i - 1, feval) - if (0== ( (i-1) %% print.every.n)) - cat(paste(msg, "\n", sep="")) + if (0 == ( (i - 1) %% print.every.n)) + cat(paste(msg, "\n", sep = "")) if (!is.null(early.stop.round)) { - score = strsplit(msg,':|\\s+')[[1]][3] - score = as.numeric(score) - if ((maximize && score>bestScore) || (!maximize && score bestScore) || (!maximize && score < bestScore)) { + bestScore <- score + bestInd <- i } else { - if (i-bestInd>=early.stop.round) { - earlyStopflag = TRUE + earlyStopflag = TRUE + if (i - bestInd >= early.stop.round) { cat('Stopping. Best iteration:',bestInd) break } @@ -225,9 +225,13 @@ xgb.train <- function(params=list(), data, nrounds, watchlist = list(), } } bst <- xgb.Booster.check(bst) + if (!is.null(early.stop.round)) { - bst$bestScore = bestScore - bst$bestInd = bestInd + bst$bestScore <- bestScore + bst$bestInd <- bestInd } + + attr(bst, "call") <- fit.call + attr(bst, "params") <- params return(bst) -} +} diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index 164dc1838539..92637bb434c0 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -58,29 +58,26 @@ #' pred <- predict(bst, test$data) #' #' @export -#' -xgboost <- function(data = NULL, label = NULL, missing = NULL, weight = NULL, - params = list(), nrounds, +xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, + params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) 
{ dtrain <- xgb.get.DMatrix(data, label, missing, weight) - + params <- append(params, list(...)) - + if (verbose > 0) { watchlist <- list(train = dtrain) } else { watchlist <- list() } - + bst <- xgb.train(params, dtrain, nrounds, watchlist, verbose = verbose, print.every.n=print.every.n, early.stop.round = early.stop.round, maximize = maximize, save_period = save_period, save_name = save_name) - - return(bst) -} - + return(bst) +} #' Training part from Mushroom Data Set #' #' This data set is originally from the Mushroom data set, diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 532c5d873280..193618be30e3 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -14,28 +14,28 @@ class(train$data) # this is the basic usage of xgboost you can put matrix in data field # note: we are putting in sparse matrix here, xgboost naturally handles sparse input # use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector) -print("training xgboost with sparseMatrix") +print("Training xgboost with sparseMatrix") bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic") # alternatively, you can put in dense matrix, i.e. basic R-matrix -print("training xgboost with Matrix") +print("Training xgboost with Matrix") bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic") # you can also put in xgb.DMatrix object, which stores label, data and other meta datas needed for advanced features -print("training xgboost with xgb.DMatrix") +print("Training xgboost with xgb.DMatrix") dtrain <- xgb.DMatrix(data = train$data, label = train$label) bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic") # Verbose = 0,1,2 -print ('train xgboost with verbose 0, no message') +print("Train xgboost with verbose 0, no message") bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic", verbose = 0) -print ('train xgboost with verbose 1, print evaluation metric') +print("Train xgboost with verbose 1, print evaluation metric") bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic", verbose = 1) -print ('train xgboost with verbose 2, also print information about tree') +print("Train xgboost with verbose 2, also print information about tree") bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, objective = "binary:logistic", verbose = 2) @@ -76,11 +76,11 @@ dtest <- xgb.DMatrix(data = test$data, label=test$label) watchlist <- list(train=dtrain, test=dtest) # to train with watchlist, use xgb.train, which contains more advanced features # watchlist allows us to monitor the evaluation result on all data in the list -print ('train xgboost using xgb.train with watchlist') +print("Train xgboost using xgb.train with watchlist") bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist, nthread = 2, objective = "binary:logistic") # we can change evaluation metrics, or use multiple evaluation metrics -print ('train xgboost using xgb.train with watchlist, watch logloss and error') +print("train xgboost using xgb.train with watchlist, watch logloss and error") bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nround=2, watchlist=watchlist, eval.metric = "error", 
eval.metric = "logloss", nthread = 2, objective = "binary:logistic") @@ -102,4 +102,9 @@ xgb.dump(bst, "dump.raw.txt", with.stats = T) # Finally, you can check which features are the most important. print("Most important features (look at column Gain):") -print(xgb.importance(feature_names = train$data@Dimnames[[2]], filename_dump = "dump.raw.txt")) +imp_matrix <- xgb.importance(feature_names = train$data@Dimnames[[2]], model = bst) +print(imp_matrix) + +# Feature importance bar plot by gain +print("Feature importance Plot : ") +print(xgb.plot.importance(importance_matrix = imp_matrix)) diff --git a/R-package/demo/boost_from_prediction.R b/R-package/demo/boost_from_prediction.R index 9d7db806b9aa..7fa7d8545de4 100644 --- a/R-package/demo/boost_from_prediction.R +++ b/R-package/demo/boost_from_prediction.R @@ -23,4 +23,4 @@ setinfo(dtrain, "base_margin", ptrain) setinfo(dtest, "base_margin", ptest) print('this is result of boost from initial prediction') -bst <- xgb.train( param, dtrain, 1, watchlist ) +bst <- xgb.train(params = param, data = dtrain, nrounds = 1, watchlist = watchlist) diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R index 2fbf41772029..7a8dfaa82532 100644 --- a/R-package/demo/create_sparse_matrix.R +++ b/R-package/demo/create_sparse_matrix.R @@ -67,10 +67,9 @@ output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] cat("Learning...\n") bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") -xgb.dump(bst, 'xgb.model.dump', with.stats = T) # sparse_matrix@Dimnames[[2]] represents the column names of the sparse matrix. -importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') +importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) print(importance) # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column). 
diff --git a/R-package/demo/cross_validation.R b/R-package/demo/cross_validation.R index c3148ae215b5..5d748f6797c9 100644 --- a/R-package/demo/cross_validation.R +++ b/R-package/demo/cross_validation.R @@ -43,9 +43,9 @@ evalerror <- function(preds, dtrain) { param <- list(max.depth=2,eta=1,silent=1, objective = logregobj, eval_metric = evalerror) # train with customized objective -xgb.cv(param, dtrain, nround, nfold = 5) +xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5) # do cross validation with prediction values for each fold -res <- xgb.cv(param, dtrain, nround, nfold=5, prediction = TRUE) +res <- xgb.cv(params = param, data = dtrain, nrounds = nround, nfold = 5, prediction = TRUE) res$dt length(res$pred) diff --git a/R-package/demo/predict_leaf_indices.R b/R-package/demo/predict_leaf_indices.R index c03a17955f9d..fc87befb7abc 100644 --- a/R-package/demo/predict_leaf_indices.R +++ b/R-package/demo/predict_leaf_indices.R @@ -1,21 +1,52 @@ require(xgboost) +require(data.table) +require(Matrix) + +set.seed(1982) + # load in the agaricus dataset data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') -dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) -dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) +dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) -param <- list(max.depth=2,eta=1,silent=1,objective='binary:logistic') -watchlist <- list(eval = dtest, train = dtrain) -nround = 5 +param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic') +nround = 4 # training the model for two rounds -bst = xgb.train(param, dtrain, nround, nthread = 2, watchlist) -cat('start testing prediction from first n trees\n') +bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2) + +# Model accuracy without new features +accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) -### predict using first 2 tree -pred_with_leaf = predict(bst, dtest, ntreelimit = 2, predleaf = TRUE) -head(pred_with_leaf) # by default, we predict using all the trees + pred_with_leaf = predict(bst, dtest, predleaf = TRUE) head(pred_with_leaf) + +create.new.tree.features <- function(model, original.features){ + pred_with_leaf <- predict(model, original.features, predleaf = TRUE) + cols <- list() + for(i in 1:ncol(pred_with_leaf)){ + # max is not the real max but it is not important for the purpose of adding features + leaf.id <- sort(unique(pred_with_leaf[,i])) + cols[[i]] <- factor(x = pred_with_leaf[,i], levels = leaf.id) + } + cBind(original.features, sparse.model.matrix( ~ . -1, as.data.frame(cols))) +} + +# Convert previous features to one hot encoding +new.features.train <- create.new.tree.features(bst, agaricus.train$data) +new.features.test <- create.new.tree.features(bst, agaricus.test$data) + +# learning with new features +new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) +new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) +watchlist <- list(train = new.dtrain) +bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2) + +# Model accuracy with new features +accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Here the accuracy was already good and is now perfect.
+cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\n")) diff --git a/R-package/man/agaricus.test.Rd b/R-package/man/agaricus.test.Rd index c54e30ba34d2..52ff08f86199 100644 --- a/R-package/man/agaricus.test.Rd +++ b/R-package/man/agaricus.test.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.test} \alias{agaricus.test} \title{Test part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 1611 +\format{A list containing a label vector, and a dgCMatrix object with 1611 rows and 126 variables} \usage{ data(agaricus.test) @@ -24,8 +24,8 @@ This data set includes the following fields: \references{ https://archive.ics.uci.edu/ml/datasets/Mushroom -Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository -[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. } \keyword{datasets} diff --git a/R-package/man/agaricus.train.Rd b/R-package/man/agaricus.train.Rd index 955257148559..e27d3ac25a4f 100644 --- a/R-package/man/agaricus.train.Rd +++ b/R-package/man/agaricus.train.Rd @@ -1,10 +1,10 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \docType{data} \name{agaricus.train} \alias{agaricus.train} \title{Training part from Mushroom Data Set} -\format{A list containing a label vector, and a dgCMatrix object with 6513 +\format{A list containing a label vector, and a dgCMatrix object with 6513 rows and 127 variables} \usage{ data(agaricus.train) @@ -24,8 +24,8 @@ This data set includes the following fields: \references{ https://archive.ics.uci.edu/ml/datasets/Mushroom -Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository -[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, +Bache, K. & Lichman, M. (2013). UCI Machine Learning Repository +[http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science. 
} \keyword{datasets} diff --git a/R-package/man/edge.parser.Rd b/R-package/man/edge.parser.Rd new file mode 100644 index 000000000000..25ee4a30ae8e --- /dev/null +++ b/R-package/man/edge.parser.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.deepness.R +\name{edge.parser} +\alias{edge.parser} +\title{Parse the graph to extract vector of edges} +\usage{ +edge.parser(element) +} +\arguments{ +\item{element}{igraph object containing the path from the root to the leaf.} +} +\description{ +Parse the graph to extract vector of edges +} + diff --git a/R-package/man/get.paths.to.leaf.Rd b/R-package/man/get.paths.to.leaf.Rd new file mode 100644 index 000000000000..1fdcfd5d7121 --- /dev/null +++ b/R-package/man/get.paths.to.leaf.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.deepness.R +\name{get.paths.to.leaf} +\alias{get.paths.to.leaf} +\title{Extract path from root to leaf from data.table} +\usage{ +get.paths.to.leaf(dt.tree) +} +\arguments{ +\item{dt.tree}{data.table containing the nodes and edges of the trees} +} +\description{ +Extract path from root to leaf from data.table +} + diff --git a/R-package/man/getinfo.Rd b/R-package/man/getinfo.Rd index 87c507566774..f8b4f6b991e8 100644 --- a/R-package/man/getinfo.Rd +++ b/R-package/man/getinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/getinfo.xgb.DMatrix.R \docType{methods} \name{getinfo} diff --git a/R-package/man/multiplot.Rd b/R-package/man/multiplot.Rd new file mode 100644 index 000000000000..a2fef7d99a25 --- /dev/null +++ b/R-package/man/multiplot.Rd @@ -0,0 +1,15 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.deepness.R +\name{multiplot} +\alias{multiplot} +\title{Plot multiple graphs at the same time} +\usage{ +multiplot(..., cols = 1) +} +\arguments{ +\item{cols}{number of columns} +} +\description{ +Plot multiple graph aligned by rows and columns. 
+} + diff --git a/R-package/man/nrow-xgb.DMatrix-method.Rd b/R-package/man/nrow-xgb.DMatrix-method.Rd index f86709afd339..1fd52b9c1a8d 100644 --- a/R-package/man/nrow-xgb.DMatrix-method.Rd +++ b/R-package/man/nrow-xgb.DMatrix-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/nrow.xgb.DMatrix.R \docType{methods} \name{nrow,xgb.DMatrix-method} @@ -18,5 +18,6 @@ data(agaricus.train, package='xgboost') train <- agaricus.train dtrain <- xgb.DMatrix(train$data, label=train$label) stopifnot(nrow(dtrain) == nrow(train$data)) + } diff --git a/R-package/man/predict-xgb.Booster-method.Rd b/R-package/man/predict-xgb.Booster-method.Rd index 3ce2e2025cc0..341ced8c6ac7 100644 --- a/R-package/man/predict-xgb.Booster-method.Rd +++ b/R-package/man/predict-xgb.Booster-method.Rd @@ -1,29 +1,29 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/predict.xgb.Booster.R \docType{methods} \name{predict,xgb.Booster-method} \alias{predict,xgb.Booster-method} \title{Predict method for eXtreme Gradient Boosting model} \usage{ -\S4method{predict}{xgb.Booster}(object, newdata, missing = NULL, +\S4method{predict}{xgb.Booster}(object, newdata, missing = NA, outputmargin = FALSE, ntreelimit = NULL, predleaf = FALSE) } \arguments{ \item{object}{Object of class "xgb.Boost"} -\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\item{newdata}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} -\item{missing}{Missing is only used when input is dense matrix, pick a float +\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometime a data use 0 or other extreme value to represents missing values.} \item{outputmargin}{whether the prediction should be shown in the original -value of sum of functions, when outputmargin=TRUE, the prediction is +value of sum of functions, when outputmargin=TRUE, the prediction is untransformed margin value. In logistic regression, outputmargin=T will output value before logistic transformation.} \item{ntreelimit}{limit number of trees used in prediction, this parameter is -only valid for gbtree, but not for gblinear. set it to be value bigger +only valid for gbtree, but not for gblinear. set it to be value bigger than 0. It will use all trees by default.} \item{predleaf}{whether predict leaf index instead. If set to TRUE, the output will be a matrix object.} @@ -31,12 +31,22 @@ than 0. It will use all trees by default.} \description{ Predicted values based on xgboost model object. } +\details{ +The option \code{ntreelimit} purpose is to let the user train a model with lots +of trees but use only the first trees for prediction to avoid overfitting +(without having to train a new model with less trees). + +The option \code{predleaf} purpose is inspired from §3.1 of the paper +\code{Practical Lessons from Predicting Clicks on Ads at Facebook}. +The idea is to use the model as a generator of new features which capture non linear link +from original features. 
+} \examples{ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") pred <- predict(bst, test$data) } diff --git a/R-package/man/predict-xgb.Booster.handle-method.Rd b/R-package/man/predict-xgb.Booster.handle-method.Rd index 7eb237a9471c..34454e555fd9 100644 --- a/R-package/man/predict-xgb.Booster.handle-method.Rd +++ b/R-package/man/predict-xgb.Booster.handle-method.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/predict.xgb.Booster.handle.R \docType{methods} \name{predict,xgb.Booster.handle-method} diff --git a/R-package/man/setinfo.Rd b/R-package/man/setinfo.Rd index edf5284bd574..cb939721e964 100644 --- a/R-package/man/setinfo.Rd +++ b/R-package/man/setinfo.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/setinfo.xgb.DMatrix.R \docType{methods} \name{setinfo} diff --git a/R-package/man/slice.Rd b/R-package/man/slice.Rd index 20a78a383280..b177221157ed 100644 --- a/R-package/man/slice.Rd +++ b/R-package/man/slice.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/slice.xgb.DMatrix.R \docType{methods} \name{slice} diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd index 9d4d19d37c2b..2e892cc6d952 100644 --- a/R-package/man/xgb.DMatrix.Rd +++ b/R-package/man/xgb.DMatrix.Rd @@ -1,13 +1,13 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.R \name{xgb.DMatrix} \alias{xgb.DMatrix} \title{Contruct xgb.DMatrix object} \usage{ -xgb.DMatrix(data, info = list(), missing = 0, ...) +xgb.DMatrix(data, info = list(), missing = NA, ...) 
} \arguments{ -\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character +\item{data}{a \code{matrix} object, a \code{dgCMatrix} object or a character indicating the data file.} \item{info}{a list of information of the xgb.DMatrix object} diff --git a/R-package/man/xgb.DMatrix.save.Rd b/R-package/man/xgb.DMatrix.save.Rd index 3ba36f55a365..78348c3faa6d 100644 --- a/R-package/man/xgb.DMatrix.save.Rd +++ b/R-package/man/xgb.DMatrix.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.DMatrix.save.R \name{xgb.DMatrix.save} \alias{xgb.DMatrix.save} diff --git a/R-package/man/xgb.create.features.Rd b/R-package/man/xgb.create.features.Rd new file mode 100644 index 000000000000..cab2ab654dd8 --- /dev/null +++ b/R-package/man/xgb.create.features.Rd @@ -0,0 +1,88 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.create.features.R +\name{xgb.create.features} +\alias{xgb.create.features} +\title{Create new features from a previously learned model} +\usage{ +xgb.create.features(model, training.data) +} +\arguments{ +\item{model}{decision tree boosting model learned on the original data} + +\item{training.data}{original data (usually provided as a \code{dgCMatrix} matrix)} +} +\value{ +\code{dgCMatrix} matrix including both the original data and the new features. +} +\description{ +May improve the learning by adding new features to the training data based on the decision trees from a previously learned model. +} +\details{ +This is the function inspired from the paragraph 3.1 of the paper: + +\strong{Practical Lessons from Predicting Clicks on Ads at Facebook} + +\emph{(Xinran He, Junfeng Pan, Ou Jin, Tianbing Xu, Bo Liu, Tao Xu, Yan, xin Shi, Antoine Atallah, Ralf Herbrich, Stuart Bowers, +Joaquin Quiñonero Candela)} + +International Workshop on Data Mining for Online Advertising (ADKDD) - August 24, 2014 + +\url{https://research.facebook.com/publications/758569837499391/practical-lessons-from-predicting-clicks-on-ads-at-facebook/}. + +Extract explaining the method: + +"\emph{We found that boosted decision trees are a powerful and very +convenient way to implement non-linear and tuple transformations +of the kind we just described. We treat each individual +tree as a categorical feature that takes as value the +index of the leaf an instance ends up falling in. We use +1-of-K coding of this type of features. + +For example, consider the boosted tree model in Figure 1 with 2 subtrees, +where the first subtree has 3 leafs and the second 2 leafs. If an +instance ends up in leaf 2 in the first subtree and leaf 1 in +second subtree, the overall input to the linear classifier will +be the binary vector \code{[0, 1, 0, 1, 0]}, where the first 3 entries +correspond to the leaves of the first subtree and last 2 to +those of the second subtree. + +[...] + +We can understand boosted decision tree +based transformation as a supervised feature encoding that +converts a real-valued vector into a compact binary-valued +vector. 
A traversal from root node to a leaf node represents +a rule on certain features.}" +} +\examples{ +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(data = agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(data = agaricus.test$data, label = agaricus.test$label) + +param <- list(max.depth=2, eta=1, silent=1, objective='binary:logistic') +nround = 4 + +bst = xgb.train(params = param, data = dtrain, nrounds = nround, nthread = 2) + +# Model accuracy without new features +accuracy.before <- sum((predict(bst, agaricus.test$data) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Convert previous features to one hot encoding +new.features.train <- xgb.create.features(model = bst, agaricus.train$data) +new.features.test <- xgb.create.features(model = bst, agaricus.test$data) + +# learning with new features +new.dtrain <- xgb.DMatrix(data = new.features.train, label = agaricus.train$label) +new.dtest <- xgb.DMatrix(data = new.features.test, label = agaricus.test$label) +watchlist <- list(train = new.dtrain) +bst <- xgb.train(params = param, data = new.dtrain, nrounds = nround, nthread = 2) + +# Model accuracy with new features +accuracy.after <- sum((predict(bst, new.dtest) >= 0.5) == agaricus.test$label) / length(agaricus.test$label) + +# Here the accuracy was already good and is now perfect. +cat(paste("The accuracy was", accuracy.before, "before adding leaf features and it is now", accuracy.after, "!\\n")) + +} + diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index bb23992a2e48..f3a1fcfd1916 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -1,14 +1,13 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.cv.R \name{xgb.cv} \alias{xgb.cv} \title{Cross Validation} \usage{ -xgb.cv(params = list(), data, nrounds, nfold, label = NULL, - missing = NULL, prediction = FALSE, showsd = TRUE, metrics = list(), - obj = NULL, feval = NULL, stratified = TRUE, folds = NULL, - verbose = T, print.every.n = 1L, early.stop.round = NULL, - maximize = NULL, ...) +xgb.cv(params = list(), data, nrounds, nfold, label = NULL, missing = NA, + prediction = FALSE, showsd = TRUE, metrics = list(), obj = NULL, + feval = NULL, stratified = TRUE, folds = NULL, verbose = T, + print.every.n = 1L, early.stop.round = NULL, maximize = NULL, ...) } \arguments{ \item{params}{the list of parameters. Commonly used ones are: @@ -41,7 +40,7 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item{showsd}{\code{boolean}, whether show standard deviation of cross validation} -\item{metrics,}{list of evaluation metrics to be used in corss validation, +\item{metrics, }{list of evaluation metrics to be used in corss validation, when it is not specified, the evaluation metric is chosen according to objective function. Possible options are: \itemize{ @@ -52,11 +51,11 @@ value that represents missing value. Sometime a data use 0 or other extreme valu \item \code{merror} Exact matching error, used to evaluate multi-class classification }} -\item{obj}{customized objective function. Returns gradient and second order +\item{obj}{customized objective function. Returns gradient and second order gradient with given prediction and dtrain.} -\item{feval}{custimized evaluation function. Returns -\code{list(metric='metric-name', value='metric-value')} with given +\item{feval}{custimized evaluation function. 
Returns +\code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain.} \item{stratified}{\code{boolean} whether sampling of folds should be stratified by the values of labels in \code{data}} @@ -68,12 +67,12 @@ If folds are supplied, the nfold and stratified parameters would be ignored.} \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. - \code{maximize=TRUE} means the larger the evaluation score the better.} +\code{maximize=TRUE} means the larger the evaluation score the better.} \item{...}{other parameters to pass to \code{params}.} } @@ -90,9 +89,9 @@ If \code{prediction = FALSE}, just a \code{data.table} with each mean and standa The cross valudation function of xgboost } \details{ -The original sample is randomly partitioned into \code{nfold} equal size subsamples. +The original sample is randomly partitioned into \code{nfold} equal size subsamples. -Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. +Of the \code{nfold} subsamples, a single subsample is retained as the validation data for testing the model, and the remaining \code{nfold - 1} subsamples are used as training data. The cross-validation process is then repeated \code{nrounds} times, with each of the \code{nfold} subsamples used exactly once as the validation data. diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index eaf1ca52148b..cafa8ac14019 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.dump.R \name{xgb.dump} \alias{xgb.dump} @@ -11,17 +11,17 @@ xgb.dump(model = NULL, fname = NULL, fmap = "", with.stats = FALSE) \item{fname}{the name of the text file where to save the model text dump. If not provided or set to \code{NULL} the function will return the model as a \code{character} vector.} -\item{fmap}{feature map file representing the type of feature. -Detailed description could be found at +\item{fmap}{feature map file representing the type of feature. +Detailed description could be found at \url{https://github.com/dmlc/xgboost/wiki/Binary-Classification#dump-model}. 
See demo/ for walkthrough example in R, and -\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} +\url{https://github.com/dmlc/xgboost/blob/master/demo/data/featmap.txt} for example Format.} -\item{with.stats}{whether dump statistics of splits - When this option is on, the model dump comes with two additional statistics: - gain is the approximate loss function gain we get in each split; - cover is the sum of second order gradient in each node.} +\item{with.stats}{whether dump statistics of splits +When this option is on, the model dump comes with two additional statistics: +gain is the approximate loss function gain we get in each split; +cover is the sum of second order gradient in each node.} } \value{ if fname is not provided or set to \code{NULL} the function will return the model as a \code{character} vector. Otherwise it will return \code{TRUE}. @@ -34,7 +34,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") # save the model in file 'xgb.model.dump' xgb.dump(bst, 'xgb.model.dump', with.stats = TRUE) diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 11740e4acbc0..f30f8149adcd 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -1,18 +1,16 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.importance.R \name{xgb.importance} \alias{xgb.importance} \title{Show importance of features in a model} \usage{ -xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL, - data = NULL, label = NULL, target = function(x) ((x + label) == 2)) +xgb.importance(feature_names = NULL, model = NULL, data = NULL, + label = NULL, target = function(x) ((x + label) == 2)) } \arguments{ -\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} +\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (\code{with.stats = T} in function \code{xgb.dump}).} - -\item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} +\item{model}{generated by the \code{xgb.train} function.} \item{data}{the dataset used for the training step. Will be used with \code{label} parameter for co-occurence computation. More information in \code{Detail} part. This parameter is optional.} @@ -24,23 +22,24 @@ xgb.importance(feature_names = NULL, filename_dump = NULL, model = NULL, A \code{data.table} of the features used in the model with their average gain (and their weight for boosted tree model) in the model. } \description{ -Read a xgboost model text dump. -Can be tree or linear model (text dump of linear model are only supported in dev version of \code{Xgboost} for now). +Create a \code{data.table} of the most important features of a model. 
} \details{ -This is the function to understand the model trained (and through your model, your data). - -Results are returned for both linear and tree models. +This function is for both linear and tree models. -\code{data.table} is returned by the function. -There are 3 columns : +\code{data.table} is returned by the function. +The columns are: \itemize{ - \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump. - \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then average per feature to give a vision of the entire model. Highest percentage means important feature to predict the \code{label} used for the training ; - \item \code{Cover} metric of the number of observation related to this feature (only available for tree models) ; - \item \code{Weight} percentage representing the relative number of times a feature have been taken into trees. \code{Gain} should be prefered to search the most important feature. For boosted linear model, this column has no meaning. + \item \code{Features} name of the features as provided in \code{feature_names} or already present in the model dump; + \item \code{Gain} contribution of each feature to the model. For boosted tree model, each gain of each feature of each tree is taken into account, then averaged per feature to give a vision of the entire model. A higher percentage means a more important feature for predicting the \code{label} used for the training (only available for tree models); + \item \code{Cover} metric of the number of observations related to this feature (only available for tree models); + \item \code{Weight} percentage representing the relative number of times a feature has been taken into trees. } +If you don't provide \code{feature_names}, the index of the features will be used instead. + +Because the index is extracted from the model dump (made on the C++ side), it starts at 0 (usual in C++) instead of 1 (usual in R). + Co-occurence count ------------------ @@ -53,18 +52,14 @@ If you need to remember one thing only: until you want to leave us early, don't \examples{ data(agaricus.train, package='xgboost') -# Both dataset are list with two items, a sparse matrix and labels -# (labels = outcome column which will be learned). -# Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train - -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -# train$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.importance(train$data@Dimnames[[2]], model = bst) +# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix.
+xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst) # Same thing with co-occurence computation this time -xgb.importance(train$data@Dimnames[[2]], model = bst, data = train$data, label = train$label) +xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst, data = agaricus.train$data, label = agaricus.train$label) + } diff --git a/R-package/man/xgb.load.Rd b/R-package/man/xgb.load.Rd index 1331ff2496f0..92576ad95bbb 100644 --- a/R-package/man/xgb.load.Rd +++ b/R-package/man/xgb.load.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.load.R \name{xgb.load} \alias{xgb.load} @@ -17,7 +17,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index c53ed057f8b5..c82ba3cf447d 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -1,33 +1,33 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.model.dt.tree.R \name{xgb.model.dt.tree} \alias{xgb.model.dt.tree} -\title{Convert tree model dump to data.table} +\title{Parse boosted tree model text dump} \usage{ -xgb.model.dt.tree(feature_names = NULL, filename_dump = NULL, - model = NULL, text = NULL, n_first_tree = NULL) +xgb.model.dt.tree(feature_names = NULL, model = NULL, text = NULL, + n_first_tree = NULL) } \arguments{ -\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} +\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If the model already contains feature names, this argument should be \code{NULL} (default value).} -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} +\item{model}{object created by the \code{xgb.train} function.} -\item{model}{dump generated by the \code{xgb.train} function. Avoid the creation of a dump file.} +\item{text}{\code{character} vector generated by the \code{xgb.dump} function. Model dump must include the gain per feature and per tree (parameter \code{with.stats = TRUE} in function \code{xgb.dump}).} -\item{text}{dump generated by the \code{xgb.dump} function. Avoid the creation of a dump file. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}).} - -\item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} +\item{n_first_tree}{limit the plot to the \code{n} first trees. If set to \code{NULL}, all trees of the model are plotted. Performance can be low depending of the size of the model.} } \value{ -A \code{data.table} of the features used in the model with their gain, cover and few other thing. 
+A \code{data.table} of the features used in the model with their gain, cover and some other information. } \description{ -Read a tree model text dump and return a data.table. +Parse a boosted tree model text dump and return a \code{data.table}. } \details{ -General function to convert a text dump of tree model to a Matrix. The purpose is to help user to explore the model and get a better understanding of it. +General function to convert a text dump of tree model to a \code{data.table}. + +The purpose is to help the user explore the model and get a better understanding of it. -The content of the \code{data.table} is organised that way: +The columns of the \code{data.table} are: \itemize{ \item \code{ID}: unique identifier of a node ; @@ -39,21 +39,17 @@ The content of the \code{data.table} is organised that way: \item \code{Quality}: it's the gain related to the split in this specific node ; \item \code{Cover}: metric to measure the number of observation affected by the split ; \item \code{Tree}: ID of the tree. It is included in the main ID ; - \item \code{Yes.X} or \code{No.X}: data related to the pointer in \code{Yes} or \code{No} column ; + \item \code{Yes.Feature}, \code{No.Feature}, \code{Yes.Cover}, \code{No.Cover}, \code{Yes.Quality} and \code{No.Quality}: data related to the pointer in \code{Yes} or \code{No} column ; } } \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). -#Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train - -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.model.dt.tree(agaricus.train$data@Dimnames[[2]], model = bst) +# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. +xgb.model.dt.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) + } diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd new file mode 100644 index 000000000000..e11a7495eab6 --- /dev/null +++ b/R-package/man/xgb.plot.deepness.Rd @@ -0,0 +1,46 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.deepness.R +\name{xgb.plot.deepness} +\alias{xgb.plot.deepness} +\title{Plot model trees deepness} +\usage{ +xgb.plot.deepness(model = NULL) +} +\arguments{ +\item{model}{model generated by the \code{xgb.train} function.} +} +\value{ +Two graphs showing the distribution of the model deepness. +} +\description{ +Generate a graph to plot the distribution of deepness among trees. +} +\details{ +Display both the number of \code{leaf} nodes and the distribution of \code{weighted observations} +by tree deepness level. + +The purpose of this function is to help the user to find the best trade-off to set +the \code{max.depth} and \code{min_child_weight} parameters according to the bias / variance trade-off. + +See \link{xgb.train} for more information about these parameters. + +The graph is made of two parts: + +\itemize{ + \item Count: number of leaves per level of deepness; + \item Weighted cover: normalized weighted cover per leaf (weighted number of instances).
+} + +This function is inspired by the blog post \url{http://aysent.github.io/2015/11/08/random-forest-leaf-visualization.html} +} +\examples{ +data(agaricus.train, package='xgboost') + +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, + eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", + min_child_weight = 50) + +xgb.plot.deepness(model = bst) + +} + diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index 4147278b90ba..2f9d5651dfef 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.plot.importance.R \name{xgb.plot.importance} \alias{xgb.plot.importance} @@ -15,11 +15,11 @@ xgb.plot.importance(importance_matrix = NULL, numberOfClusters = c(1:10)) A \code{ggplot2} bar graph representing each feature by a horizontal bar. Longer is the bar, more important is the feature. Features are classified by importance and clustered by importance. The group is represented through the color of the bar. } \description{ -Read a data.table containing feature importance details and plot it. +Read a data.table containing feature importance details and plot it (for both GLM and Trees). } \details{ The purpose of this function is to easily represent the importance of each feature of a model. -The function return a ggplot graph, therefore each of its characteristic can be overriden (to customize it). +The function returns a ggplot graph, therefore each of its characteristic can be overriden (to customize it). In particular you may want to override the title of the graph. To do so, add \code{+ ggtitle("A GRAPH NAME")} next to the value returned by this function. } \examples{ @@ -28,13 +28,13 @@ data(agaricus.train, package='xgboost') #Both dataset are list with two items, a sparse matrix and labels #(labels = outcome column which will be learned). #Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#train$data@Dimnames[[2]] represents the column names of the sparse matrix. -importance_matrix <- xgb.importance(train$data@Dimnames[[2]], model = bst) +#agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. +importance_matrix <- xgb.importance(agaricus.train$data@Dimnames[[2]], model = bst) xgb.plot.importance(importance_matrix) + } diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd new file mode 100644 index 000000000000..4d97c58b40c1 --- /dev/null +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -0,0 +1,58 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.multi.trees.R +\name{xgb.plot.multi.trees} +\alias{xgb.plot.multi.trees} +\title{Project all trees on one tree and plot it} +\usage{ +xgb.plot.multi.trees(model, feature_names = NULL, features.keep = 5, + plot.width = NULL, plot.height = NULL) +} +\arguments{ +\item{model}{dump generated by the \code{xgb.train} function.} + +\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). 
If model dump already contains feature names, this argument should be \code{NULL}.} + +\item{features.keep}{number of features to keep in each position of the multi trees.} + +\item{plot.width}{width in pixels of the graph to produce} + +\item{plot.height}{height in pixels of the graph to produce} +} +\value{ +Two graphs showing the distribution of the model deepness. +} +\description{ +Visualization of the ensemble of trees as a single collective unit. +} +\details{ +This function tries to capture the complexity of gradient boosted tree ensemble +in a cohesive way. + +The goal is to improve the interpretability of the model generally seen as black box. +The function is dedicated to boosting applied to decision trees only. + +The purpose is to move from an ensemble of trees to a single tree only. + +It takes advantage of the fact that the shape of a binary tree is only defined by +its deepness (therefore in a boosting model, all trees have the same shape). + +Moreover, the trees tend to reuse the same features. + +The function will project each tree on one, and keep for each position the +\code{features.keep} first features (based on Gain per feature measure). + +This function is inspired by this blog post: +\url{https://wellecks.wordpress.com/2015/02/21/peering-into-the-black-box-visualizing-lambdamart/} +} +\examples{ +data(agaricus.train, package='xgboost') + +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 15, + eta = 1, nthread = 2, nround = 30, objective = "binary:logistic", + min_child_weight = 50) + +p <- xgb.plot.multi.trees(model = bst, feature_names = agaricus.train$data@Dimnames[[2]], features.keep = 3) +print(p) + +} + diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 4501d87ce336..c087059e0301 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -1,58 +1,48 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.plot.tree.R \name{xgb.plot.tree} \alias{xgb.plot.tree} \title{Plot a boosted tree model} \usage{ -xgb.plot.tree(feature_names = NULL, filename_dump = NULL, model = NULL, - n_first_tree = NULL, CSSstyle = NULL, width = NULL, height = NULL) +xgb.plot.tree(feature_names = NULL, model = NULL, n_first_tree = NULL, + plot.width = NULL, plot.height = NULL) } \arguments{ -\item{feature_names}{names of each feature as a character vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} - -\item{filename_dump}{the path to the text file storing the model. Model dump must include the gain per feature and per tree (parameter \code{with.stats = T} in function \code{xgb.dump}). Possible to provide a model directly (see \code{model} argument).} +\item{feature_names}{names of each feature as a \code{character} vector. Can be extracted from a sparse matrix (see example). If model dump already contains feature names, this argument should be \code{NULL}.} \item{model}{generated by the \code{xgb.train} function. Avoid the creation of a dump file.} \item{n_first_tree}{limit the plot to the n first trees. If \code{NULL}, all trees of the model are plotted. Performance can be low for huge models.} -\item{CSSstyle}{a \code{character} vector storing a css style to customize the appearance of nodes. 
Look at the \href{https://github.com/knsv/mermaid/wiki}{Mermaid wiki} for more information.} - -\item{width}{the width of the diagram in pixels.} +\item{plot.width}{the width of the diagram in pixels.} -\item{height}{the height of the diagram in pixels.} +\item{plot.height}{the height of the diagram in pixels.} } \value{ A \code{DiagrammeR} of the model. } \description{ -Read a tree model text dump. -Plotting only works for boosted tree model (not linear model). +Read a tree model text dump and plot the model. } \details{ The content of each node is organised that way: \itemize{ - \item \code{feature} value ; - \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be ; + \item \code{feature} value; + \item \code{cover}: the sum of second order gradient of training data classified to the leaf, if it is square loss, this simply corresponds to the number of instances in that branch. Deeper in the tree a node is, lower this metric will be; \item \code{gain}: metric the importance of the node in the model. -} +} -Each branch finishes with a leaf. For each leaf, only the \code{cover} is indicated. -It uses \href{https://github.com/knsv/mermaid/}{Mermaid} library for that purpose. +The function uses \href{http://www.graphviz.org/}{GraphViz} library for that purpose. } \examples{ data(agaricus.train, package='xgboost') -#Both dataset are list with two items, a sparse matrix and labels -#(labels = outcome column which will be learned). -#Each column of the sparse Matrix is a feature in one hot encoding format. -train <- agaricus.train - -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") -#agaricus.test$data@Dimnames[[2]] represents the column names of the sparse matrix. -xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) +# agaricus.train$data@Dimnames[[2]] represents the column names of the sparse matrix. 
+xgb.plot.tree(feature_names = agaricus.train$data@Dimnames[[2]], model = bst) + } diff --git a/R-package/man/xgb.save.Rd b/R-package/man/xgb.save.Rd index eca097fac54e..db335105c859 100644 --- a/R-package/man/xgb.save.Rd +++ b/R-package/man/xgb.save.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.save.R \name{xgb.save} \alias{xgb.save} @@ -19,7 +19,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") xgb.save(bst, 'xgb.model') bst <- xgb.load('xgb.model') diff --git a/R-package/man/xgb.save.raw.Rd b/R-package/man/xgb.save.raw.Rd index 79c356c0f031..1e9f4a4dbb04 100644 --- a/R-package/man/xgb.save.raw.Rd +++ b/R-package/man/xgb.save.raw.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.save.raw.R \name{xgb.save.raw} \alias{xgb.save.raw} @@ -18,7 +18,7 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2,objective = "binary:logistic") raw <- xgb.save.raw(bst) bst <- xgb.load(raw) diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 15a0b0ba7743..7f7ae49627ef 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,4 +1,4 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgb.train.R \name{xgb.train} \alias{xgb.train} @@ -10,7 +10,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, save_name = "xgboost.model", ...) } \arguments{ -\item{params}{the list of parameters. +\item{params}{the list of parameters. 1. General Parameters @@ -18,30 +18,30 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{booster} which booster to use, can be \code{gbtree} or \code{gblinear}. Default: \code{gbtree} \item \code{silent} 0 means printing running messages, 1 means silent mode. Default: 0 } - + 2. Booster Parameters 2.1. Parameter for Tree Booster \itemize{ \item \code{eta} control the learning rate: scale the contribution of each tree by a factor of \code{0 < eta < 1} when it is added to the current approximation. Used to prevent overfitting by making the boosting process more conservative. Lower value for \code{eta} implies larger value for \code{nrounds}: low \code{eta} value means model more robust to overfitting but slower to compute. Default: 0.3 - \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. + \item \code{gamma} minimum loss reduction required to make a further partition on a leaf node of the tree. the larger, the more conservative the algorithm will be. \item \code{max_depth} maximum depth of a tree. Default: 6 - \item \code{min_child_weight} minimum sum of instance weight(hessian) needed in a child. 
If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 - \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 + \item \code{min_child_weight} minimum sum of instance weight (hessian) needed in a child. If the tree partition step results in a leaf node with the sum of instance weight less than min_child_weight, then the building process will give up further partitioning. In linear regression mode, this simply corresponds to minimum number of instances needed to be in each node. The larger, the more conservative the algorithm will be. Default: 1 + \item \code{subsample} subsample ratio of the training instance. Setting it to 0.5 means that xgboost randomly collected half of the data instances to grow trees and this will prevent overfitting. It makes computation shorter (because less data to analyse). It is advised to use this parameter with \code{eta} and increase \code{nround}. Default: 1 \item \code{colsample_bytree} subsample ratio of columns when constructing each tree. Default: 1 \item \code{num_parallel_tree} Experimental parameter. number of trees to grow per round. Useful to test Random Forest through Xgboost (set \code{colsample_bytree < 1}, \code{subsample < 1} and \code{round = 1}) accordingly. Default: 1 } 2.2. Parameter for Linear Booster - + \itemize{ \item \code{lambda} L2 regularization term on weights. Default: 0 \item \code{lambda_bias} L2 regularization term on bias. Default: 0 \item \code{alpha} L1 regularization term on weights. (there is no L1 reg on bias because it is not important). Default: 0 } -3. Task Parameters +3. Task Parameters \itemize{ \item \code{objective} specify the learning task and the corresponding learning objective, users can pass a self-defined function to it. The default objective options are below: @@ -51,7 +51,7 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item \code{binary:logistic} logistic regression for binary classification. Output probability. \item \code{binary:logitraw} logistic regression for binary classification, output score before logistic transformation. \item \code{num_class} set the number of classes. To use only with multiclass objectives. - \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{tonum_class}. + \item \code{multi:softmax} set xgboost to do multiclass classification using the softmax objective. Class is represented by a number and should be from 0 to \code{num_class}. \item \code{multi:softprob} same as softmax, but output a vector of ndata * nclass, which can be further reshaped to ndata, nclass matrix. The result contains predicted probabilities of each data point belonging to each class. \item \code{rank:pairwise} set xgboost to do ranking task by minimizing the pairwise loss. 
} @@ -64,25 +64,25 @@ xgb.train(params = list(), data, nrounds, watchlist = list(), obj = NULL, \item{nrounds}{the max number of iterations} \item{watchlist}{what information should be printed when \code{verbose=1} or - \code{verbose=2}. Watchlist is used to specify validation set monitoring - during training. For example user can specify - watchlist=list(validation1=mat1, validation2=mat2) to watch - the performance of each round's model on mat1 and mat2} +\code{verbose=2}. Watchlist is used to specify validation set monitoring +during training. For example user can specify + watchlist=list(validation1=mat1, validation2=mat2) to watch + the performance of each round's model on mat1 and mat2} -\item{obj}{customized objective function. Returns gradient and second order +\item{obj}{customized objective function. Returns gradient and second order gradient with given prediction and dtrain,} -\item{feval}{custimized evaluation function. Returns -\code{list(metric='metric-name', value='metric-value')} with given +\item{feval}{custimized evaluation function. Returns +\code{list(metric='metric-name', value='metric-value')} with given prediction and dtrain,} -\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print +\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both} \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. @@ -98,24 +98,25 @@ keeps getting worse consecutively for \code{k} rounds.} An advanced interface for training xgboost model. Look at \code{\link{xgboost}} function for a simpler interface. } \details{ -This is the training function for \code{xgboost}. +This is the training function for \code{xgboost}. It supports advanced features such as \code{watchlist}, customized objective function (\code{feval}), therefore it is more flexible than \code{\link{xgboost}} function. -Parallelization is automatically enabled if \code{OpenMP} is present. +Parallelization is automatically enabled if \code{OpenMP} is present. Number of threads can also be manually specified via \code{nthread} parameter. \code{eval_metric} parameter (not listed above) is set automatically by Xgboost but can be overriden by parameter. Below is provided the list of different metric optimized by Xgboost to help you to understand how it works inside or to use them with the \code{watchlist} parameter. \itemize{ \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} + \item \code{mlogloss} multiclass logloss. \url{https://www.kaggle.com/wiki/MultiClassLogLoss} \item \code{error} Binary classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. 
For the predictions, the evaluation will regard the instances with prediction value larger than 0.5 as positive instances, and the others as negative instances. \item \code{merror} Multiclass classification error rate. It is calculated as \code{(wrong cases) / (all cases)}. \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG} } - + Full list of parameters is available in the Wiki \url{https://github.com/dmlc/xgboost/wiki/Parameters}. This function only accepts an \code{\link{xgb.DMatrix}} object as the input. diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd index a05560a19506..e31e5da43058 100644 --- a/R-package/man/xgboost.Rd +++ b/R-package/man/xgboost.Rd @@ -1,22 +1,22 @@ -% Generated by roxygen2 (4.1.1): do not edit by hand +% Generated by roxygen2: do not edit by hand % Please edit documentation in R/xgboost.R \name{xgboost} \alias{xgboost} \title{eXtreme Gradient Boosting (Tree) library} \usage{ -xgboost(data = NULL, label = NULL, missing = NULL, weight = NULL, +xgboost(data = NULL, label = NULL, missing = NA, weight = NULL, params = list(), nrounds, verbose = 1, print.every.n = 1L, early.stop.round = NULL, maximize = NULL, save_period = 0, save_name = "xgboost.model", ...) } \arguments{ -\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or +\item{data}{takes \code{matrix}, \code{dgCMatrix}, local data file or \code{xgb.DMatrix}.} \item{label}{the response variable. User should not set this field, if data is local data file or \code{xgb.DMatrix}.} -\item{missing}{Missing is only used when input is dense matrix, pick a float +\item{missing}{Missing is only used when input is dense matrix, pick a float value that represents missing value. Sometimes a data use 0 or other extreme value to represents missing values.} \item{weight}{a vector indicating the weight for each row of the input.} @@ -34,21 +34,21 @@ Commonly used ones are: \item \code{max.depth} maximum depth of the tree \item \code{nthread} number of thread used in training, if not set, all threads are used } - + Look at \code{\link{xgb.train}} for a more complete list of parameters or \url{https://github.com/dmlc/xgboost/wiki/Parameters} for the full list. - + See also \code{demo/} for walkthrough example in R.} \item{nrounds}{the max number of iterations} -\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print +\item{verbose}{If 0, xgboost will stay silent. If 1, xgboost will print information of performance. If 2, xgboost will print information of both performance and construction progress information} \item{print.every.n}{Print every N progress messages when \code{verbose>0}. Default is 1 which means all messages are printed.} -\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. -If set to an integer \code{k}, training with a validation set will stop if the performance +\item{early.stop.round}{If \code{NULL}, the early stopping function is not triggered. +If set to an integer \code{k}, training with a validation set will stop if the performance keeps getting worse consecutively for \code{k} rounds.} \item{maximize}{If \code{feval} and \code{early.stop.round} are set, then \code{maximize} must be set as well. 
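As a quick illustration of how the `watchlist`, `eval_metric`, `early.stop.round` and `maximize` arguments described above fit together, here is a minimal R sketch using the bundled `agaricus` demo data; the parameter values are illustrative only, not recommendations:

```R
library(xgboost)

data(agaricus.train, package = 'xgboost')
data(agaricus.test, package = 'xgboost')

dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
dtest  <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label)

# eval_metric = "error" is reported for every entry of the watchlist after each round.
param <- list(max.depth = 2, eta = 0.3, silent = 1,
              objective = "binary:logistic", eval_metric = "error")

# Training stops early if the evaluation score keeps getting worse for
# 3 consecutive rounds; maximize = FALSE because a lower error is better.
bst <- xgb.train(params = param, data = dtrain, nrounds = 50,
                 watchlist = list(train = dtrain, eval = dtest),
                 early.stop.round = 3, maximize = FALSE, print.every.n = 5)
```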
@@ -75,8 +75,9 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max.depth = 2, +bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") pred <- predict(bst, test$data) + } diff --git a/R-package/tests/testthat/test_basic.R b/R-package/tests/testthat/test_basic.R index 791f1246c30c..34d47103f0e5 100644 --- a/R-package/tests/testthat/test_basic.R +++ b/R-package/tests/testthat/test_basic.R @@ -4,30 +4,33 @@ context("basic functions") data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') -train = agaricus.train -test = agaricus.test +train <- agaricus.train +test <- agaricus.test +set.seed(1994) test_that("train and predict", { - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic") - pred = predict(bst, test$data) + pred <- predict(bst, test$data) + expect_equal(length(pred), 1611) }) - test_that("early stopping", { - res = xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, + res <- xgb.cv(data = train$data, label = train$label, max.depth = 2, nfold = 5, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) - expect_true(nrow(res)<20) - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + expect_true(nrow(res) < 20) + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", early.stop.round = 3, maximize = FALSE) - pred = predict(bst, test$data) + pred <- predict(bst, test$data) + expect_equal(length(pred), 1611) }) test_that("save_period", { - bst = xgboost(data = train$data, label = train$label, max.depth = 2, + bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 0.3, nthread = 2, nround = 20, objective = "binary:logistic", save_period = 10, save_name = "xgb.model") - pred = predict(bst, test$data) + pred <- predict(bst, test$data) + expect_equal(length(pred), 1611) }) diff --git a/R-package/tests/testthat/test_custom_objective.R b/R-package/tests/testthat/test_custom_objective.R index 9fcbeca4d230..7407246c643f 100644 --- a/R-package/tests/testthat/test_custom_objective.R +++ b/R-package/tests/testthat/test_custom_objective.R @@ -2,46 +2,47 @@ context('Test models with custom objective') require(xgboost) +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') +dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) + test_that("custom objective works", { - data(agaricus.train, package='xgboost') - data(agaricus.test, package='xgboost') - dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) - dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) - + watchlist <- list(eval = dtest, train = dtrain) num_round <- 2 - + logregobj <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } evalerror <- function(preds, dtrain) { labels <- getinfo(dtrain, "label") - err <- as.numeric(sum(labels != (preds > 
0)))/length(labels) + err <- as.numeric(sum(labels != (preds > 0))) / length(labels) return(list(metric = "error", value = err)) } - - param <- list(max.depth=2, eta=1, nthread = 2, silent=1, + + param <- list(max.depth=2, eta=1, nthread = 2, silent=1, objective=logregobj, eval_metric=evalerror) - + bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") expect_equal(length(bst$raw), 1064) attr(dtrain, 'label') <- getinfo(dtrain, 'label') - + logregobjattr <- function(preds, dtrain) { labels <- attr(dtrain, 'label') - preds <- 1/(1 + exp(-preds)) + preds <- 1 / (1 + exp(-preds)) grad <- preds - labels hess <- preds * (1 - preds) return(list(grad = grad, hess = hess)) } - param <- list(max.depth=2, eta=1, nthread = 2, silent=1, - objective=logregobjattr, eval_metric=evalerror) + param <- list(max.depth=2, eta=1, nthread = 2, silent = 1, + objective = logregobjattr, eval_metric = evalerror) bst <- xgb.train(param, dtrain, num_round, watchlist) expect_equal(class(bst), "xgb.Booster") expect_equal(length(bst$raw), 1064) -}) \ No newline at end of file +}) diff --git a/R-package/tests/testthat/test_helpers.R b/R-package/tests/testthat/test_helpers.R index 4d80146e30a1..efc22f0b90c4 100644 --- a/R-package/tests/testthat/test_helpers.R +++ b/R-package/tests/testthat/test_helpers.R @@ -5,28 +5,64 @@ require(data.table) require(Matrix) require(vcd) +set.seed(1982) data(Arthritis) data(agaricus.train, package='xgboost') df <- data.table(Arthritis, keep.rownames = F) -df[,AgeDiscret:= as.factor(round(Age/10,0))] -df[,AgeCat:= as.factor(ifelse(Age > 30, "Old", "Young"))] -df[,ID:=NULL] -sparse_matrix = sparse.model.matrix(Improved~.-1, data = df) -output_vector = df[,Y:=0][Improved == "Marked",Y:=1][,Y] -bst <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, - eta = 1, nthread = 2, nround = 10,objective = "binary:logistic") +df[,AgeDiscret := as.factor(round(Age / 10,0))] +df[,AgeCat := as.factor(ifelse(Age > 30, "Old", "Young"))] +df[,ID := NULL] +sparse_matrix <- sparse.model.matrix(Improved~.-1, data = df) +output_vector <- df[,Y := 0][Improved == "Marked",Y := 1][,Y] +bst.Tree <- xgboost(data = sparse_matrix, label = output_vector, max.depth = 9, + eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gbtree") +bst.GLM <- xgboost(data = sparse_matrix, label = output_vector, + eta = 1, nthread = 2, nround = 10, objective = "binary:logistic", booster = "gblinear") + +feature.names <- agaricus.train$data@Dimnames[[2]] test_that("xgb.dump works", { - capture.output(print(xgb.dump(bst))) + capture.output(print(xgb.dump(bst.Tree))) + capture.output(print(xgb.dump(bst.GLM))) + expect_true(xgb.dump(bst.Tree, 'xgb.model.dump', with.stats = T)) +}) + +test_that("xgb.model.dt.tree works with and without feature names", { + names.dt.trees <- c("ID", "Feature", "Split", "Yes", "No", "Missing", "Quality", "Cover", + "Tree", "Yes.Feature", "Yes.Cover", "Yes.Quality", "No.Feature", "No.Cover", "No.Quality") + dt.tree <- xgb.model.dt.tree(feature_names = feature.names, model = bst.Tree) + expect_equal(names.dt.trees, names(dt.tree)) + expect_equal(dim(dt.tree), c(162, 15)) + xgb.model.dt.tree(model = bst.Tree) +}) + +test_that("xgb.importance works with and without feature names", { + importance.Tree <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.Tree) + expect_equal(dim(importance.Tree), c(7, 4)) + expect_equal(colnames(importance.Tree), c("Feature", "Gain", "Cover", "Frequency")) + xgb.importance(model = 
bst.Tree) + xgb.plot.importance(importance_matrix = importance.Tree) }) -test_that("xgb.importance works", { - xgb.dump(bst, 'xgb.model.dump', with.stats = T) - importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump') - expect_equal(dim(importance), c(7, 4)) +test_that("xgb.importance works with GLM model", { + importance.GLM <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst.GLM) + expect_equal(dim(importance.GLM), c(10, 2)) + expect_equal(colnames(importance.GLM), c("Feature", "Weight")) + xgb.importance(model = bst.GLM) + xgb.plot.importance(importance.GLM) }) -test_that("xgb.plot.tree works", { - xgb.plot.tree(agaricus.train$data@Dimnames[[2]], model = bst) -}) \ No newline at end of file +test_that("xgb.plot.tree works with and without feature names", { + xgb.plot.tree(feature_names = feature.names, model = bst.Tree) + xgb.plot.tree(model = bst.Tree) +}) + +test_that("xgb.plot.multi.trees works with and without feature names", { + xgb.plot.multi.trees(model = bst.Tree, feature_names = feature.names, features.keep = 3) + xgb.plot.multi.trees(model = bst.Tree, features.keep = 3) +}) + +test_that("xgb.plot.deepness works", { + xgb.plot.deepness(model = bst.Tree) +}) diff --git a/R-package/tests/testthat/test_lint.R b/R-package/tests/testthat/test_lint.R new file mode 100644 index 000000000000..2f2a07d54356 --- /dev/null +++ b/R-package/tests/testthat/test_lint.R @@ -0,0 +1,27 @@ +context("Code is of high quality and lint free") +test_that("Code Lint", { + skip_on_cran() + skip_on_travis() + skip_if_not_installed("lintr") + my_linters <- list( + absolute_paths_linter=lintr::absolute_paths_linter, + assignment_linter=lintr::assignment_linter, + closed_curly_linter=lintr::closed_curly_linter, + commas_linter=lintr::commas_linter, + # commented_code_linter=lintr::commented_code_linter, + infix_spaces_linter=lintr::infix_spaces_linter, + line_length_linter=lintr::line_length_linter, + no_tab_linter=lintr::no_tab_linter, + object_usage_linter=lintr::object_usage_linter, + # snake_case_linter=lintr::snake_case_linter, + # multiple_dots_linter=lintr::multiple_dots_linter, + object_length_linter=lintr::object_length_linter, + open_curly_linter=lintr::open_curly_linter, + # single_quotes_linter=lintr::single_quotes_linter, + spaces_inside_linter=lintr::spaces_inside_linter, + spaces_left_parentheses_linter=lintr::spaces_left_parentheses_linter, + trailing_blank_lines_linter=lintr::trailing_blank_lines_linter, + trailing_whitespace_linter=lintr::trailing_whitespace_linter + ) + # lintr::expect_lint_free(linters=my_linters) # uncomment this if you want to check code quality +}) diff --git a/R-package/tests/testthat/test_parameter_exposure.R b/R-package/tests/testthat/test_parameter_exposure.R new file mode 100644 index 000000000000..769059b76ae0 --- /dev/null +++ b/R-package/tests/testthat/test_parameter_exposure.R @@ -0,0 +1,32 @@ +context('Test model params and call are exposed to R') + +require(xgboost) + +data(agaricus.train, package='xgboost') +data(agaricus.test, package='xgboost') + +dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label) +dtest <- xgb.DMatrix(agaricus.test$data, label = agaricus.test$label) + +bst <- xgboost(data = dtrain, + max.depth = 2, + eta = 1, + nround = 10, + nthread = 1, + verbose = 0, + objective = "binary:logistic") + +test_that("call is exposed to R", { + model_call <- attr(bst, "call") + expect_is(model_call, "call") +}) + +test_that("params is exposed to R", { + model_params <- attr(bst, "params") + + 
expect_is(model_params, "list") + + expect_equal(model_params$eta, 1) + expect_equal(model_params$max.depth, 2) + expect_equal(model_params$objective, "binary:logistic") +}) diff --git a/R-package/tests/testthat/test_poisson_regression.R b/R-package/tests/testthat/test_poisson_regression.R index 5d3d78e27ca0..c5389dd0ff53 100644 --- a/R-package/tests/testthat/test_poisson_regression.R +++ b/R-package/tests/testthat/test_poisson_regression.R @@ -1,13 +1,14 @@ context('Test poisson regression model') require(xgboost) +set.seed(1994) test_that("poisson regression works", { data(mtcars) - bst = xgboost(data=as.matrix(mtcars[,-11]),label=mtcars[,11], - objective='count:poisson',nrounds=5) + bst <- xgboost(data = as.matrix(mtcars[,-11]),label = mtcars[,11], + objective = 'count:poisson', nrounds=5) expect_equal(class(bst), "xgb.Booster") - pred = predict(bst,as.matrix(mtcars[,-11])) + pred <- predict(bst,as.matrix(mtcars[, -11])) expect_equal(length(pred), 32) - sqrt(mean((pred-mtcars[,11])^2)) -}) \ No newline at end of file + expect_equal(sqrt(mean( (pred - mtcars[,11]) ^ 2)), 1.16, tolerance = 0.01) +}) diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index fa780ee94224..08d6bfdf5144 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -190,7 +190,7 @@ Measure feature importance In the code below, `sparse_matrix@Dimnames[[2]]` represents the column names of the sparse matrix. These names are the original values of the features (remember, each binary column == one value of one *categorical* feature). ```{r} -importance <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst) +importance <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst) head(importance) ``` @@ -202,7 +202,7 @@ head(importance) `Cover` measures the relative quantity of observations concerned by a feature. -`Frequence` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it). +`Frequency` is a simpler way to measure the `Gain`. It just counts the number of times a feature is used in all generated trees. You should not use it (unless you know why you want to use it). ### Improvement in the interpretability of feature importance data.table @@ -213,10 +213,10 @@ One simple solution is to count the co-occurrences of a feature and a class of t For that purpose we will execute the same function as above but using two more parameters, `data` and `label`. ```{r} -importanceRaw <- xgb.importance(sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector) +importanceRaw <- xgb.importance(feature_names = sparse_matrix@Dimnames[[2]], model = bst, data = sparse_matrix, label = output_vector) # Cleaning for better display -importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequence=NULL)] +importanceClean <- importanceRaw[,`:=`(Cover=NULL, Frequency=NULL)] head(importanceClean) ``` diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index 45d2e8b8ea27..7534240ac287 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -345,7 +345,7 @@ Feature importance is similar to R gbm package's relative influence (rel.inf). 
``` importance_matrix <- xgb.importance(model = bst) print(importance_matrix) -xgb.plot.importance(importance_matrix) +xgb.plot.importance(importance_matrix = importance_matrix) ``` View the trees from a model diff --git a/README.md b/README.md index 4c14e722bd1f..f33394d40762 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,9 @@ =========== [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost) [![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org) +[![GitHub license](http://dmlc.github.io/img/apache2.svg)](./LICENSE) [![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost) +[![PyPI version](https://badge.fury.io/py/xgboost.svg)](https://pypi.python.org/pypi/xgboost/) [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version. @@ -29,6 +31,9 @@ Contents What's New ---------- +* XGBoost helps Vlad Mironov, Alexander Guschin to win the [CERN LHCb experiment Flavour of Physics competition](https://www.kaggle.com/c/flavours-of-physics). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/11/30/flavour-of-physics-technical-write-up-1st-place-go-polar-bears/). +* XGBoost helps Mario Filho, Josef Feigl, Lucas, Gilberto to win the [Caterpillar Tube Pricing competition](https://www.kaggle.com/c/caterpillar-tube-pricing). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/09/22/caterpillar-winners-interview-1st-place-gilberto-josef-leustagos-mario/). +* XGBoost helps Halla Yang to win the [Recruit Coupon Purchase Prediction Challenge](https://www.kaggle.com/c/coupon-purchase-prediction). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/10/21/recruit-coupon-purchase-winners-interview-2nd-place-halla-yang/). * XGBoost helps Owen Zhang to win the [Avito Context Ad Click competition](https://www.kaggle.com/c/avito-context-ad-clicks). Check out the [interview from Kaggle](http://blog.kaggle.com/2015/08/26/avito-winners-interview-1st-place-owen-zhang/). 
* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance) Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower) diff --git a/demo/README.md b/demo/README.md index d6f061484962..5a7a25f7611f 100644 --- a/demo/README.md +++ b/demo/README.md @@ -22,8 +22,8 @@ This is a list of short codes introducing different functionalities of xgboost p [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) * Predicting using first n trees [python](guide-python/predict_first_ntree.py) - [R](../R-package/demo/boost_from_prediction.R) - [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl) + [R](../R-package/demo/predict_first_ntree.R) + [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/predict_first_ntree.jl) * Generalized Linear Model [python](guide-python/generalized_linear_model.py) [R](../R-package/demo/generalized_linear_model.R) @@ -49,4 +49,3 @@ Benchmarks ---------- * [Starter script for Kaggle Higgs Boson](kaggle-higgs) * [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) - diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md index d26b8fcf2b06..d84095f2bf29 100644 --- a/demo/guide-python/README.md +++ b/demo/guide-python/README.md @@ -9,4 +9,6 @@ XGBoost Python Feature Walkthrough * [Predicting leaf indices](predict_leaf_indices.py) * [Sklearn Wrapper](sklearn_examples.py) * [Sklearn Parallel](sklearn_parallel.py) +* [Sklearn access evals result](sklearn_evals_result.py) +* [Access evals result](evals_result.py) * [External Memory](external_memory.py) diff --git a/demo/guide-python/evals_result.py b/demo/guide-python/evals_result.py new file mode 100644 index 000000000000..8449b93077c4 --- /dev/null +++ b/demo/guide-python/evals_result.py @@ -0,0 +1,30 @@ +## +# This script demonstrate how to access the eval metrics in xgboost +## + +import xgboost as xgb +dtrain = xgb.DMatrix('../data/agaricus.txt.train', silent=True) +dtest = xgb.DMatrix('../data/agaricus.txt.test', silent=True) + +param = [('max_depth', 2), ('objective', 'binary:logistic'), ('eval_metric', 'logloss'), ('eval_metric', 'error')] + +num_round = 2 +watchlist = [(dtest,'eval'), (dtrain,'train')] + +evals_result = {} +bst = xgb.train(param, dtrain, num_round, watchlist, evals_result=evals_result) + +print('Access logloss metric directly from evals_result:') +print(evals_result['eval']['logloss']) + +print('') +print('Access metrics through a loop:') +for e_name, e_mtrs in evals_result.items(): + print('- {}'.format(e_name)) + for e_mtr_name, e_mtr_vals in e_mtrs.items(): + print(' - {}'.format(e_mtr_name)) + print(' - {}'.format(e_mtr_vals)) + +print('') +print('Access complete dictionary:') +print(evals_result) diff --git a/demo/guide-python/sklearn_evals_result.py b/demo/guide-python/sklearn_evals_result.py new file mode 100644 index 000000000000..a72cdfc5275f --- /dev/null +++ b/demo/guide-python/sklearn_evals_result.py @@ -0,0 +1,43 @@ +## +# This script demonstrate how to access the xgboost eval metrics by using sklearn +## + +import xgboost as xgb +import numpy as np +from sklearn.datasets import make_hastie_10_2 + +X, y = make_hastie_10_2(n_samples=2000, random_state=42) + +# Map labels from {-1, 1} to {0, 1} +labels, y = np.unique(y, return_inverse=True) + +X_train, X_test = X[:1600], X[1600:] +y_train, y_test = y[:1600], y[1600:] + +param_dist = 
{'objective':'binary:logistic', 'n_estimators':2} + +clf = xgb.XGBModel(**param_dist) +# Or you can use: clf = xgb.XGBClassifier(**param_dist) + +clf.fit(X_train, y_train, + eval_set=[(X_train, y_train), (X_test, y_test)], + eval_metric='logloss', + verbose=True) + +# Load evals result by calling the evals_result() function +evals_result = clf.evals_result() + +print('Access logloss metric directly from validation_0:') +print(evals_result['validation_0']['logloss']) + +print('') +print('Access metrics through a loop:') +for e_name, e_mtrs in evals_result.items(): + print('- {}'.format(e_name)) + for e_mtr_name, e_mtr_vals in e_mtrs.items(): + print(' - {}'.format(e_mtr_name)) + print(' - {}'.format(e_mtr_vals)) + +print('') +print('Access complete dict:') +print(evals_result) diff --git a/doc/build.md b/doc/build.md index b97237bcbac3..7eae0bbd07ce 100644 --- a/doc/build.md +++ b/doc/build.md @@ -15,66 +15,17 @@ Build XGBoost in OS X with OpenMP --------------------------------- Here is the complete solution to use OpenMp-enabled compilers to install XGBoost. -1. Obtain gcc with openmp support by `brew install gcc --without-multilib` **or** clang with openmp by `brew install clang-omp`. The clang one is recommended because the first method requires us compiling gcc inside the machine (more than an hour in mine)! (BTW, `brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.) +1. Obtain gcc-5.x.x with openmp support by `brew install gcc --without-multilib`. (`brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.) -2. **if you are planing to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to +2. `cd xgboost` then `bash build.sh` to compile XGBoost. - ```C++ - #include /* instead of #include */` - ``` - - to make it work, otherwise you might get this error - - `src/tree/../utils/omp.h:9:10: error: 'omp.h' file not found...` - - - -3. Set the `Makefile` correctly for compiling cpp version xgboost then python version xgboost. - - ```Makefile - export CC = gcc-4.9 - export CXX = g++-4.9 - ``` - - Or - - ```Makefile - export CC = clang-omp - export CXX = clang-omp++ - ``` - - Remember to change `header` (mentioned in step 2) if using clang-omp. - - Then `cd xgboost` then `bash build.sh` to compile XGBoost. And go to `wrapper` sub-folder to install python version. +3. Install xgboost package for Python and R -4. Set the `Makevars` file in highest piority for R. +- For Python: go to `python-package` sub-folder to install python version with `python setup.py install` (or `sudo python setup.py install`). +- For R: Set the `Makevars` file in highest piority for R. The point is, there are three `Makevars` : `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest piority (surprise!). 
- So, **add** or **change** `~/.R/Makevars` to the following lines: - - ```Makefile - CC=gcc-4.9 - CXX=g++-4.9 - SHLIB_OPENMP_CFLAGS = -fopenmp - SHLIB_OPENMP_CXXFLAGS = -fopenmp - SHLIB_OPENMP_FCFLAGS = -fopenmp - SHLIB_OPENMP_FFLAGS = -fopenmp - ``` - - Or - - ```Makefile - CC=clang-omp - CXX=clang-omp++ - SHLIB_OPENMP_CFLAGS = -fopenmp - SHLIB_OPENMP_CXXFLAGS = -fopenmp - SHLIB_OPENMP_FCFLAGS = -fopenmp - SHLIB_OPENMP_FFLAGS = -fopenmp - ``` - - Again, remember to change `header` if using clang-omp. - Then inside R, run ```R diff --git a/doc/index.md b/doc/index.md index 40b7c519252f..2329c9677008 100644 --- a/doc/index.md +++ b/doc/index.md @@ -11,7 +11,7 @@ This document is hosted at http://xgboost.readthedocs.org/. You can also browse How to Get Started ------------------ The best way to get started to learn xgboost is by the examples. There are three types of examples you can find in xgboost. -* [Tutorials](#tutorials) are self-conatained tutorials on a complete data science tasks. +* [Tutorials](#tutorials) are self-contained tutorials on complete data science tasks. * [XGBoost Code Examples](../demo/) are collections of code and benchmarks of xgboost. - There is a walkthrough section in this to walk you through specific API features. * [Highlight Solutions](#highlight-solutions) are presentations using xgboost to solve real world problems. diff --git a/doc/model.md b/doc/model.md index d9ecd2620f7c..7874a4cfcc8a 100644 --- a/doc/model.md +++ b/doc/model.md @@ -1,8 +1,12 @@ Introduction to Boosted Trees ============================= -XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" is proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, Friedman. Based on this original model. This is a tutorial on boosted trees, most of content are based on this [slide](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost. +XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" is proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, by Friedman. +XGBoost is based on this original model. +This is a tutorial on gradient boosted trees, and most of the content is based on these [slides](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost. -The GBM(boosted trees) has been around for really a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way of supervised learning. We think this explanation is cleaner, more formal, and motivates the variant used in xgboost. +The GBM (boosted trees) has been around for really a while, and there are a lot of materials on the topic. +This tutorial tries to explain boosted trees in a self-contained and principled way using the elements of supervised learning. +We think this explanation is cleaner, more formal, and motivates the variant used in xgboost. Elements of Supervised Learning ------------------------------- @@ -10,21 +14,21 @@ XGBoost is used for supervised learning problems, where we use the training data Before we dive into trees, let us start by reviewing the basic elements in supervised learning. ### Model and Parameters -The ***model*** in supervised learning usually refers to the mathematical structure on how to given the prediction ``$ y_i $`` given ``$ x_i $``. 
-For example, a common model is *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features. +The ***model*** in supervised learning usually refers to the mathematical structure of how to make the prediction ``$ y_i $`` given ``$ x_i $``. +For example, a common model is a *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features. The prediction value can have different interpretations, depending on the task. -For example, it can be logistic transformed to get the probability of positive class in logistic regression, and it can also be used as ranking score when we want to rank the outputs. +For example, it can be logistic transformed to get the probability of positive class in logistic regression, and it can also be used as a ranking score when we want to rank the outputs. -The ***parameters*** are the undermined part that we need to learn from data. In linear regression problem, the parameters are the co-efficients ``$ w $``. +The ***parameters*** are the undetermined part that we need to learn from data. In linear regression problems, the parameters are the coefficients ``$ w $``. Usually we will use ``$ \Theta $`` to denote the parameters. ### Objective Function : Training Loss + Regularization -Based on different understanding or assumption of ``$ y_i $``, we can have different problems as regression, classification, ordering, etc. -We need to find a way to find the best parameters given the training data. In order to do so, we need to define a so called ***objective function***, -to measure the performance of the model under certain set of parameters. +Based on different understandings of ``$ y_i $`` we can have different problems, such as regression, classification, ordering, etc. +We need to find a way to find the best parameters given the training data. In order to do so, we need to define a so-called ***objective function***, +to measure the performance of the model given a certain set of parameters. -A very important fact about objective functions, is they ***must always*** contains two parts: training loss and regularization. +A very important fact about objective functions is they ***must always*** contain two parts: training loss and regularization. ```math Obj(\Theta) = L(\Theta) + \Omega(\Theta) @@ -44,7 +48,8 @@ L(\theta) = \sum_i[ y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})] The ***regularization term*** is what people usually forget to add. The regularization term controls the complexity of the model, which helps us to avoid overfitting. This sounds a bit abstract, so let us consider the following problem in the following picture. You are asked to *fit* visually a step function given the input data points -on the upper left corner of the image, which solution among the tree you think is the best fit? +on the upper left corner of the image. +Which solution among the three do you think is the best fit? ![Step function](img/step_fit.png) @@ -53,26 +58,26 @@ The tradeoff between the two is also referred as bias-variance tradeoff in machi ### Why introduce the general principle -The elements introduced in above forms the basic elements of supervised learning, and they are naturally the building blocks of machine learning toolkits. -For example, you should be able to answer what is the difference and common parts between boosted trees and random forest. 
-Understanding the process in a formalized way also helps us to understand the objective that we are learning and the reason behind the heurestics such as +The elements introduced above form the basic elements of supervised learning, and they are naturally the building blocks of machine learning toolkits. +For example, you should be able to describe the differences and commonalities between boosted trees and random forests. +Understanding the process in a formalized way also helps us to understand the objective that we are learning and the reason behind the heuristics such as pruning and smoothing. Tree Ensemble ------------- Now that we have introduced the elements of supervised learning, let us get started with real trees. -To begin with, let us first learn what is the ***model*** of xgboost: tree ensembles. +To begin with, let us first learn about the ***model*** of xgboost: tree ensembles. The tree ensemble model is a set of classification and regression trees (CART). Here's a simple example of a CART -that classifies is someone will like computer games. +that classifies whether someone will like computer games. ![CART](img/cart.png) -We classify the members in thie family into different leaves, and assign them the score on corresponding leaf. -A CART is a bit different from decision trees, where the leaf only contain decision values. In CART, a real score +We classify the members of a family into different leaves, and assign them the score on corresponding leaf. +A CART is a bit different from decision trees, where the leaf only contains decision values. In CART, a real score is associated with each of the leaves, which gives us richer interpretations that go beyond classification. This also makes the unified optimization step easier, as we will see in later part of this tutorial. -Usually, a single tree is not so strong enough to be used in practice. What is actually used is the so called +Usually, a single tree is not strong enough to be used in practice. What is actually used is the so-called tree ensemble model, that sums the prediction of multiple trees together. ![TwoCART](img/twocart.png) @@ -90,9 +95,9 @@ where ``$ K $`` is the number of trees, ``$ f $`` is a function in the functiona ```math obj(\Theta) = \sum_i^n l(y_i, \hat{y}_i) + \sum_{k=1}^K \Omega(f_k) ``` -Now here comes the question, what is the *model* of random forest? It is exactly tree ensembles! So random forest and boosted trees are not different in terms of model, +Now here comes the question, what is the *model* for random forests? It is exactly tree ensembles! So random forests and boosted trees are not different in terms of model, the difference is how we train them. This means if you write a predictive service of tree ensembles, you only need to write one of them and they should directly work -for both random forest and boosted trees. One example of elements of supervised learning rocks. +for both random forests and boosted trees. One example of why elements of supervised learning rocks. Tree Boosting ------------- @@ -106,10 +111,11 @@ Obj = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\ ### Additive Training -First thing we want to ask is what are ***parameters*** of trees. You can find what we need to learn are those functions ``$f_i$``, with each contains the structure -of the tree, and the leaf score. This is much harder than traditional optimization problem where you can take the gradient and go. +First thing we want to ask is what are the ***parameters*** of trees. 
+You can find what we need to learn are those functions ``$f_i$``, with each containing the structure +of the tree and the leaf scores. This is much harder than traditional optimization problem where you can take the gradient and go. It is not easy to train all the trees at once. -Instead, we use an additive strategy: fix what we have learned, add a new tree at a time. +Instead, we use an additive strategy: fix what we have learned, add one new tree at a time. We note the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)}$``, so we have ```math @@ -120,7 +126,7 @@ We note the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)}$``, so we ha \hat{y}_i^{(t)} &= \sum_{k=1}^t f_k(x_i)= \hat{y}_i^{(t-1)} + f_t(x_i) ``` -It remains to ask Which tree do we want at each step? A natural thing is to add the one that optimizes our objective. +It remains to ask, which tree do we want at each step? A natural thing is to add the one that optimizes our objective. ```math Obj^{(t)} & = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\ @@ -135,8 +141,8 @@ Obj^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1} ``` The form of MSE is friendly, with a first order term (usually called residual) and a quadratic term. -For other loss of interest (for example, logistic loss), it is not so easy to get such a nice form. -So in general case, we take the Taylor expansion of the loss function up to the second order +For other losses of interest (for example, logistic loss), it is not so easy to get such a nice form. +So in the general case, we take the Taylor expansion of the loss function up to the second order ```math Obj^{(t)} = \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t) + constant @@ -148,15 +154,15 @@ g_i &= \partial_{\hat{y}_i^{(t)}} l(y_i, \hat{y}_i^{(t-1)})\\ h_i &= \partial_{\hat{y}_i^{(t)}}^2 l(y_i, \hat{y}_i^{(t-1)}) ``` -After we remove all the constants, the specific objective at t step becomes +After we remove all the constants, the specific objective at step ``$t$`` becomes ```math \sum_{i=1}^n [g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t) ``` -This becomes our optimization goal for the new tree. One important advantage of this definition, is that -it only depends on ``$g_i$`` and ``$h_i$``, this is how xgboost allows support of customization of loss functions. -We can optimized every loss function, including logistic regression, weighted logistic regression, using the exactly +This becomes our optimization goal for the new tree. One important advantage of this definition is that +it only depends on ``$g_i$`` and ``$h_i$``. This is how xgboost can support custom loss functions. +We can optimize every loss function, including logistic regression, weighted logistic regression, using the exactly the same solver that takes ``$g_i$`` and ``$h_i$`` as input! ### Model Complexity @@ -173,9 +179,9 @@ In XGBoost, we define the complexity as ```math \Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2 ``` -Of course there is more than one way to define the complexity, but this specific one works well in practice. The regularization is one part most tree packages takes -less carefully, or simply ignore. This was due to the traditional treatment tree learning only emphasize improving impurity, while the complexity control part -are more lies as part of heuristics. By defining it formally, we can get a better idea of what we are learning, and yes it works well in practice. 
+Of course there is more than one way to define the complexity, but this specific one works well in practice. The regularization is one part most tree packages treat +less carefully, or simply ignore. This was because the traditional treatment of tree learning only emphasized improving impurity, while the complexity control was left to heuristics. +By defining it formally, we can get a better idea of what we are learning, and yes it works well in practice. ### The Structure Score @@ -186,13 +192,15 @@ Obj^{(t)} &\approx \sum_{i=1}^n [g_i w_q(x_i) + \frac{1}{2} h_i w_{q(x_i)}^2] + &= \sum^T_{j=1} [(\sum_{i\in I_j} g_i) w_j + \frac{1}{2} (\sum_{i\in I_j} h_i + \lambda) w_j^2 ] + \gamma T ``` -where ``$ I_j = \{i|q(x_i)=j\} $`` is the set of indices of data points assigned to the ``$ j $``-th leaf. Notice that in the second line we have change the index of the summation because all the data points on the same leaf get the same score. We could further compress the expression by defining ``$ G_j = \sum_{i\in I_j} g_i $`` and ``$ H_j = \sum_{i\in I_j} h_i $``: +where ``$ I_j = \{i|q(x_i)=j\} $`` is the set of indices of data points assigned to the ``$ j $``-th leaf. +Notice that in the second line we have changed the index of the summation because all the data points on the same leaf get the same score. +We could further compress the expression by defining ``$ G_j = \sum_{i\in I_j} g_i $`` and ``$ H_j = \sum_{i\in I_j} h_i $``: ```math Obj^{(t)} = \sum^T_{j=1} [G_jw_j + \frac{1}{2} (H_j+\lambda) w_j^2] +\gamma T ``` -In this equation ``$ w_j $`` are independent to each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic and the best ``$ w_j $`` for a given structure ``$q(x)$`` and the best objective reduction we can get: +In this equation ``$ w_j $`` are independent to each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic and the best ``$ w_j $`` for a given structure ``$q(x)$`` and the best objective reduction we can get is: ```math w_j^\ast = -\frac{G_j}{H_j+\lambda}\\ @@ -202,30 +210,31 @@ The last equation measures ***how good*** a tree structure ``$q(x)$`` is. ![Structure Score](img/struct_score.png) -If all these sounds a bit complicated. Let us take a look the the picture, and see how the scores can be calculated. +If all this sounds a bit complicated, let's take a look at the picture, and see how the scores can be calculated. Basically, for a given tree structure, we push the statistics ``$g_i$`` and ``$h_i$`` to the leaves they belong to, -sum the statistics together, and use the formula to calulate how good the tree is. -This score is like impurity measure in decision tree, except that it also takes the model complexity into account. +sum the statistics together, and use the formula to calculate how good the tree is. +This score is like the impurity measure in a decision tree, except that it also takes the model complexity into account. ### Learn the tree structure -Now we have a way to measure how good a tree is ideally we can enumerate all possible trees and pick the best one. -In practice it is impossible, so we will try to one level of the tree at a time. +Now that we have a way to measure how good a tree is, ideally we would enumerate all possible trees and pick the best one. +In practice it is intractable, so we will try to optimize one level of the tree at a time. 
Specifically we try to split a leaf into two leaves, and the score it gains is ```math Gain = \frac{1}{2} \left[\frac{G_L^2}{H_L+\lambda}+\frac{G_R^2}{H_R+\lambda}-\frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma ``` -This formula can be decomposited as 1) the score on the new left leaf 2) the score on the new right leaf 3) The score on the original leaf 4) regularization on the additional leaf. -We can find an important fact here: if the gain is smaller than ``$\gamma$``, we would better not to add that branch. This is exactly the ***prunning*** techniques in tree based -models! By using the principles of supervised learning, we can naturally comes up with the reason these techniques :) +This formula can be decomposed as 1) the score on the new left leaf 2) the score on the new right leaf 3) The score on the original leaf 4) regularization on the additional leaf. +We can see an important fact here: if the gain is smaller than ``$\gamma$``, we would do better not to add that branch. This is exactly the ***pruning*** techniques in tree based +models! By using the principles of supervised learning, we can naturally come up with the reason these techniques work :) -For real valued data, we usually want to search for an optimal split. To efficiently do so, we place all the instances in a sorted way, like the following picture. +For real valued data, we usually want to search for an optimal split. To efficiently do so, we place all the instances in sorted order, like the following picture. ![Best split](img/split_find.png) + Then a left to right scan is sufficient to calculate the structure score of all possible split solutions, and we can find the best split efficiently. Final words on XGBoost ---------------------- -Now you have understand what is a boosted tree, you may ask, where is the introduction on [XGBoost](https://github.com/dmlc/xgboost)? +Now that you understand what boosted trees are, you may ask, where is the introduction on [XGBoost](https://github.com/dmlc/xgboost)? XGBoost is exactly a tool motivated by the formal principle introduced in this tutorial! More importantly, it is developed with both deep consideration in terms of ***systems optimization*** and ***principles in machine learning***. The goal of this library is to push the extreme of the computation limits of machines to provide a ***scalable***, ***portable*** and ***accurate*** library. diff --git a/doc/parameter.md b/doc/parameter.md index ba0a18870df9..b22a20df540e 100644 --- a/doc/parameter.md +++ b/doc/parameter.md @@ -1,6 +1,6 @@ XGBoost Parameters ================== -Before running XGboost, we must set three types of parameters, general parameters, booster parameters and task parameters: +Before running XGboost, we must set three types of parameters: general parameters, booster parameters and task parameters. - General parameters relates to which booster we are using to do boosting, commonly tree or linear model - Booster parameters depends on which booster you have chosen - Learning Task parameters that decides on the learning scenario, for example, regression tasks may use different parameters with ranking tasks. @@ -62,8 +62,8 @@ Parameters for Linear Booster Learning Task Parameters ------------------------ +Specify the learning task and the corresponding learning objective. 
The objective options are below: * objective [ default=reg:linear ] - - specify the learning task and the corresponding learning objective, and the objective options are below: - "reg:linear" --linear regression - "reg:logistic" --logistic regression - "binary:logistic" --logistic regression for binary classification, output probability @@ -97,9 +97,9 @@ Command Line Parameters ----------------------- The following parameters are only used in the console version of xgboost * use_buffer [ default=1 ] - - whether create binary buffer for text input, this normally will speedup loading when do + - Whether to create a binary buffer from text input. Doing so normally will speed up loading times * num_round - - the number of round for boosting. + - The number of rounds for boosting * data - The path of training data * test:data diff --git a/doc/python/python_intro.md b/doc/python/python_intro.md index b46358877dd4..9e07d3c73aa4 100644 --- a/doc/python/python_intro.md +++ b/doc/python/python_intro.md @@ -8,7 +8,7 @@ This document gives a basic walkthrough of xgboost python package. Install XGBoost --------------- -To install XGBoost, do the following steps. +To install XGBoost, do the following steps: * You need to run `make` in the root directory of the project * In the `python-package` directory run @@ -22,34 +22,39 @@ import xgboost as xgb Data Interface -------------- -XGBoost python module is able to loading from libsvm txt format file, Numpy 2D array and xgboost binary buffer file. The data will be store in ```DMatrix``` object. +The XGBoost python module is able to load data from: +- libsvm txt format file +- Numpy 2D array, and +- xgboost binary buffer file. -* To load libsvm text format file and XGBoost binary file into ```DMatrix```, the usage is like +The data will be stored in a ```DMatrix``` object. + +* To load a libsvm text file or an XGBoost binary file into ```DMatrix```, the command is: ```python dtrain = xgb.DMatrix('train.svm.txt') dtest = xgb.DMatrix('test.svm.buffer') ``` -* To load numpy array into ```DMatrix```, the usage is like +* To load a numpy array into ```DMatrix```, the command is: ```python data = np.random.rand(5,10) # 5 entities, each contains 10 features label = np.random.randint(2, size=5) # binary target dtrain = xgb.DMatrix( data, label=label) ``` -* Build ```DMatrix``` from ```scipy.sparse``` +* To load a scipy.sparse array into ```DMatrix```, the command is: ```python csr = scipy.sparse.csr_matrix((dat, (row, col))) dtrain = xgb.DMatrix(csr) ``` -* Saving ```DMatrix``` into XGBoost binary file will make loading faster in next time. The usage is like: +* Saving ```DMatrix``` into an XGBoost binary file will make loading faster next time: ```python dtrain = xgb.DMatrix('train.svm.txt') dtrain.save_binary("train.buffer") ``` -* To handle missing value in ```DMatrix```, you can initialize the ```DMatrix``` like: +* To handle missing values in ```DMatrix```, you can initialize the ```DMatrix``` by specifying the missing value: ```python dtrain = xgb.DMatrix(data, label=label, missing = -999.0) ``` -* Weight can be set when needed, like +* Weights can be set when needed: ```python w = np.random.rand(5, 1) dtrain = xgb.DMatrix(data, label=label, missing = -999.0, weight=w) @@ -62,10 +67,17 @@ XGBoost use list of pair to save [parameters](../parameter.md). 
Eg ```python param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' } param['nthread'] = 4 -plst = param.items() -plst += [('eval_metric', 'auc')] # Multiple evals can be handled in this way -plst += [('eval_metric', 'ams@0')] +param['eval_metric'] = 'auc' +``` +* You can also specify multiple eval metrics: +```python +param['eval_metric'] = ['auc', 'ams@0'] + +# alternatively: +# plst = param.items() +# plst += [('eval_metric', 'ams@0')] ``` + * Specify validations set to watch performance ```python evallist = [(dtest,'eval'), (dtrain,'train')] ``` @@ -109,9 +121,9 @@ Early stopping requires at least one set in `evals`. If there's more than one, i The model will train until the validation score stops improving. Validation error needs to decrease at least every `early_stopping_rounds` to continue training. -If early stopping occurs, the model will have two additional fields: `bst.best_score` and `bst.best_iteration`. Note that `train()` will return a model from the last iteration, not the best one. +If early stopping occurs, the model will have three additional fields: `bst.best_score`, `bst.best_iteration` and `bst.best_ntree_limit`. Note that `train()` will return a model from the last iteration, not the best one. -This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). +This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC). Note that if you specify more than one evaluation metric, the last one in `param['eval_metric']` is used for early stopping. Prediction ---------- @@ -123,9 +135,9 @@ dtest = xgb.DMatrix(data) ypred = bst.predict(xgmat) ``` -If early stopping is enabled during training, you can predict with the best iteration. +If early stopping is enabled during training, you can get predictions from the best iteration with `bst.best_ntree_limit`: ```python -ypred = bst.predict(xgmat,ntree_limit=bst.best_iteration) +ypred = bst.predict(xgmat,ntree_limit=bst.best_ntree_limit) ``` Plotting @@ -150,4 +162,4 @@ When you use ``IPython``, you can use ``to_graphviz`` function which converts th ```python xgb.to_graphviz(bst, num_trees=2) -``` \ No newline at end of file +``` diff --git a/python-package/.pylintrc b/python-package/.pylintrc new file mode 100644 index 000000000000..1e63cdabe703 --- /dev/null +++ b/python-package/.pylintrc @@ -0,0 +1,9 @@ +[MASTER] + +ignore=tests + +unexpected-special-method-signature,too-many-nested-blocks + +dummy-variables-rgx=(unused|)_.* + +reports=no \ No newline at end of file diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in index 2d93429a9ff6..83596d8269f2 100644 --- a/python-package/MANIFEST.in +++ b/python-package/MANIFEST.in @@ -1,7 +1,14 @@ -include *.sh *.md +include *.sh *.md *.rst recursive-include xgboost * recursive-include xgboost/wrapper * recursive-include xgboost/windows * recursive-include xgboost/subtree * recursive-include xgboost/src * recursive-include xgboost/multi-node * +#exclude pre-compiled .o file for less confusions +#include the pre-compiled .so is needed as a placeholder +#since it will be copy after compiling on the fly +global-exclude xgboost/wrapper/*.so.gz +global-exclude xgboost/*.o +global-exclude *.pyo +global-exclude *.pyc diff --git a/python-package/README.md b/python-package/README.md deleted file mode 100644 index eb0fa8cca53f..000000000000 --- a/python-package/README.md +++ /dev/null @@ -1,27 +0,0 @@ -XGBoost Python Package -====================== -Installation ------------- -We are on 
[PyPI](https://pypi.python.org/pypi/xgboost) now. For stable version, please install using pip: - -* ```pip install xgboost``` -* Note for windows users: this pip installation may not work on some windows environment, and it may cause unexpected errors. pip installation on windows is currently disabled for further invesigation, please install from github. - -For up-to-date version, please install from github. - -* To make the python module, type ```./build.sh``` in the root directory of project -* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) -* Install with `python setup.py install` from this directory. -* For windows users, please use the Visual Studio project file under [windows folder](../windows/). See also the [installation tutorial](https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13043/run-xgboost-from-windows-and-python) from Kaggle Otto Forum. - -Examples ------- - -* Refer also to the walk through example in [demo folder](../demo/guide-python) -* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.py) on this dataset. - -Note ------ - -* If you want to build xgboost on Mac OS X with multiprocessing support where clang in XCode by default doesn't support, please install gcc 4.9 or higher using [homebrew](http://brew.sh/) ```brew tap homebrew/versions; brew install gcc49``` -* If you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the [sklearn_parallel.py](../demo/guide-python/sklearn_parallel.py) demo. diff --git a/python-package/README.rst b/python-package/README.rst new file mode 100644 index 000000000000..04f349e1c48d --- /dev/null +++ b/python-package/README.rst @@ -0,0 +1,56 @@ +XGBoost Python Package +====================== + +|PyPI version| |PyPI downloads| + +Installation +------------ + +We are on `PyPI `__ now. For +stable version, please install using pip: + +- ``pip install xgboost`` +- Note for windows users: this pip installation may not work on some + windows environment, and it may cause unexpected errors. pip + installation on windows is currently disabled for further + invesigation, please install from github. + +For up-to-date version, please install from github. + +- To make the python module, type ``./build.sh`` in the root directory + of project +- Make sure you have + `setuptools `__ +- Install with ``cd python-package; python setup.py install`` from this directory. +- For windows users, please use the Visual Studio project file under + `windows folder <../windows/>`__. See also the `installation + tutorial `__ + from Kaggle Otto Forum. + +Examples +-------- + +- Refer also to the walk through example in `demo + folder <../demo/guide-python>`__ +- See also the `example scripts <../demo/kaggle-higgs>`__ for Kaggle + Higgs Challenge, including `speedtest + script <../demo/kaggle-higgs/speedtest.py>`__ on this dataset. + +Note +---- + +- If you want to build xgboost on Mac OS X with multiprocessing support + where clang in XCode by default doesn't support, please install gcc + 4.9 or higher using `homebrew `__ + ``brew tap homebrew/versions; brew install gcc49`` +- If you want to run XGBoost process in parallel using the fork backend + for joblib/multiprocessing, you must build XGBoost without support + for OpenMP by ``make no_omp=1``. 
Otherwise, use the forkserver (in + Python 3.4) or spawn backend. See the + `sklearn\_parallel.py <../demo/guide-python/sklearn_parallel.py>`__ + demo. + +.. |PyPI version| image:: https://badge.fury.io/py/xgboost.svg + :target: http://badge.fury.io/py/xgboost +.. |PyPI downloads| image:: https://img.shields.io/pypi/dm/xgboost.svg + :target: https://pypi.python.org/pypi/xgboost/ diff --git a/python-package/build_trouble_shooting.md b/python-package/build_trouble_shooting.md new file mode 100644 index 000000000000..67bcfda8c047 --- /dev/null +++ b/python-package/build_trouble_shooting.md @@ -0,0 +1,52 @@ +XGBoost Python Package Troubleshooting +====================== +Windows platform +------------ +The current best solution for installing xgboost on a windows machine is building from github. Please go to [windows](/windows/), build with the Visual Studio project file, and install. Additional detailed instructions can be found in this [installation tutorial](https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13043/run-xgboost-from-windows-and-python) from the Kaggle Otto Forum. + +`pip install xgboost` is **not** tested or supported on the windows platform for now. + +Linux platform (also Mac OS X in general) +------------ +**Trouble 0**: I see error messages like this when installing from github using `python setup.py install`. + + XGBoostLibraryNotFound: Cannot find XGBoost Libarary in the candicate path, did you install compilers and run build.sh in root path? + List of candidates: + /home/dmlc/anaconda/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/libxgboostwrapper.so + /home/dmlc/anaconda/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/../../wrapper/libxgboostwrapper.so + /home/dmlc/anaconda/lib/python2.7/site-packages/xgboost-0.4-py2.7.egg/xgboost/./wrapper/libxgboostwrapper.so + +**Solution 0**: Please check if you have: + +* installed the latest C++ compilers and `make`, for example `g++` and `gcc` (Linux) or `clang LLVM` (Mac OS X). Recommended compilers are `g++-5` or newer (Linux and Mac), or the `clang` that comes with Xcode on Mac OS X. For installing compilers, please refer to your system package management commands, e.g. `apt-get`, `yum` or `brew` (Mac). +* compilers in your `$PATH`. Try typing `gcc` and see if you have it in your path. +* Do you use a shell other than `bash` and install from `pip`? In some old versions of the pip installation, the shell script used `pushd` for changing directory and triggering the build process, which may fail in shells without the `pushd` command. Please update to the latest version by removing the old installation and redoing `pip install xgboost` +* Some outdated `make` may not recognize the recent changes in the `Makefile` and give this error; please update to the latest `make`: + + `/usr/lib/ruby/gems/1.8/gems/make-0.3.1/bin/make:4: undefined local variable or method 'make' for main:Object (NameError)` + +**Trouble 1**: I see the same error message as in **Trouble 0** when installing from `pip install xgboost`. + +**Solution 1**: The problem is the same as in **Trouble 0**; please see **Solution 0**. + +**Trouble 2**: I see this error message when `pip install xgboost`. It says I have `libxgboostwrapper.so` but it is not valid. + + OSError: /home/dmlc/anaconda/lib/python2.7/site-packages/xgboost/./wrapper/libxgboostwrapper.so: invalid ELF header + +**Solution 2**: The solution is as in 0 and 1: install the latest `g++` compiler and the latest `make`. 
The reason for this rare error is that `pip` ships with a pre-compiled `libxgboostwrapper.so` for Mac as a placeholder, allowing `setup.py` to find the right lib path. If a system doesn't compile, it may refer to this placeholder lib and fail. This placeholder `libxgboostwrapper.so` is automatically removed and correctly regenerated by the on-the-fly compilation for the system. + +**Trouble 3**: My system's `pip` says it can't find a valid `xgboost` installation release on `PyPI`. +**Solution 3**: Some linux systems come with an old `pip` version. Please update to the latest `pip` by following the official installation document at + +**Trouble 4**: I tried `python setup.py install` but it says the `setuptools` import fails. +**Solution 4**: Please make sure you have [setuptools](https://pypi.python.org/pypi/setuptools) before installing the python package. + +Mac OS X (specific) +------------ +Most of the troubles and solutions are the same as on the Linux platform. Mac has the following specific problems. + +**Trouble 0**: I successfully installed `xgboost` using github installation/using `pip install xgboost`. But it runs very slowly with only a single thread; what is going on? +**Solution 0**: The `clang LLVM` compiler from Xcode on Mac OS X doesn't support OpenMP multi-threading. An alternative is installing `homebrew` and `brew install g++-5`, which provides multi-threaded OpenMP support. + +**Trouble 1**: Can I install `clang-omp` for supporting OpenMP without using `gcc`? +**Solution 1**: It is not supported and may have linking errors. \ No newline at end of file diff --git a/python-package/setup.cfg b/python-package/setup.cfg index b88034e414bc..5aef279b98f5 100644 --- a/python-package/setup.cfg +++ b/python-package/setup.cfg @@ -1,2 +1,2 @@ [metadata] -description-file = README.md +description-file = README.rst diff --git a/python-package/setup.py b/python-package/setup.py index c9dfa415ccbe..4b05bc710af2 100644 --- a/python-package/setup.py +++ b/python-package/setup.py @@ -2,22 +2,11 @@ """Setup xgboost package.""" from __future__ import absolute_import import sys +import os from setuptools import setup, find_packages -import subprocess +#import subprocess sys.path.insert(0, '.') -import os -#build on the fly if install in pip -#otherwise, use build.sh in the parent directory - -if 'pip' in __file__: - if not os.name == 'nt': #if not windows - build_sh = subprocess.Popen(['sh', 'xgboost/build-python.sh']) - build_sh.wait() - output = build_sh.communicate() - print(output) - - CURRENT_DIR = os.path.dirname(__file__) # We can not import `xgboost.libpath` in setup.py directly since xgboost/__init__.py @@ -28,16 +17,13 @@ exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath) LIB_PATH = libpath['find_lib_path']() -#print LIB_PATH -#to deploy to pip, please use -#make pythonpack -#python setup.py register sdist upload -#and be sure to test it firstly using "python setup.py register sdist upload -r pypitest" +#Please use setup_pip.py for generating and deploying pip installation +#detailed instruction in setup_pip.py setup(name='xgboost', version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), - #version='0.4a13', - description=open(os.path.join(CURRENT_DIR, 'README.md')).read(), + #version='0.4a23', + description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(), install_requires=[ 'numpy', 'scipy', @@ -46,10 +32,6 @@ maintainer_email='phunter.lau@gmail.com', zip_safe=False, packages=find_packages(), - #don't need this and don't use 
this, give everything to MANIFEST.in - #package_dir = {'':'xgboost'}, - #package_data = {'': ['*.txt','*.md','*.sh'], - # } #this will use MANIFEST.in during install where we specify additional files, #this is the golden line include_package_data=True, diff --git a/python-package/setup_pip.py b/python-package/setup_pip.py new file mode 100644 index 000000000000..a6a1638e6c0e --- /dev/null +++ b/python-package/setup_pip.py @@ -0,0 +1,58 @@ +# pylint: disable=invalid-name, exec-used +"""Setup xgboost package.""" +from __future__ import absolute_import +import sys +import os +from setuptools import setup, find_packages +#import subprocess +sys.path.insert(0, '.') + +#this script is for packing and shipping pip installation +#it builds xgboost code on the fly and packs for pip +#please don't use this file for installing from github + +if os.name != 'nt': #if not windows, compile and install + os.system('sh ./xgboost/build-python.sh') +else: + print('Windows users please use github installation.') + sys.exit() + +CURRENT_DIR = os.path.dirname(__file__) + +# We can not import `xgboost.libpath` in setup.py directly since xgboost/__init__.py +# import `xgboost.core` and finally will import `numpy` and `scipy` which are setup +# `install_requires`. That's why we're using `exec` here. +libpath_py = os.path.join(CURRENT_DIR, 'xgboost/libpath.py') +libpath = {'__file__': libpath_py} +exec(compile(open(libpath_py, "rb").read(), libpath_py, 'exec'), libpath, libpath) + +LIB_PATH = libpath['find_lib_path']() + +#to deploy to pip, please use +#make pythonpack +#python setup.py register sdist upload +#and be sure to test it firstly using "python setup.py register sdist upload -r pypitest" +setup(name='xgboost', + #version=open(os.path.join(CURRENT_DIR, 'xgboost/VERSION')).read().strip(), + version='0.4a30', + description=open(os.path.join(CURRENT_DIR, 'README.rst')).read(), + install_requires=[ + 'numpy', + 'scipy', + ], + maintainer='Hongliang Liu', + maintainer_email='phunter.lau@gmail.com', + zip_safe=False, + packages=find_packages(), + #don't need this and don't use this, give everything to MANIFEST.in + #package_dir = {'':'xgboost'}, + #package_data = {'': ['*.txt','*.md','*.sh'], + # } + #this will use MANIFEST.in during install where we specify additional files, + #this is the golden line + include_package_data=True, + #!!! don't use data_files for creating pip installation, + #otherwise install_data process will copy it to + #root directory for some machines, and cause confusions on building + #data_files=[('xgboost', LIB_PATH)], + url='https://github.com/dmlc/xgboost') diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py index 06892851fe8b..cd50ca6cc2a7 100644 --- a/python-package/xgboost/__init__.py +++ b/python-package/xgboost/__init__.py @@ -10,8 +10,11 @@ from .core import DMatrix, Booster from .training import train, cv -from .sklearn import XGBModel, XGBClassifier, XGBRegressor -from .plotting import plot_importance, plot_tree, to_graphviz +try: + from .sklearn import XGBModel, XGBClassifier, XGBRegressor + from .plotting import plot_importance, plot_tree, to_graphviz +except ImportError: + print('Error when loading sklearn/plotting. 
Please install scikit-learn') VERSION_FILE = os.path.join(os.path.dirname(__file__), 'VERSION') __version__ = open(VERSION_FILE).read().strip() diff --git a/python-package/xgboost/build-python.sh b/python-package/xgboost/build-python.sh index 398b076b819d..4bec205a281e 100755 --- a/python-package/xgboost/build-python.sh +++ b/python-package/xgboost/build-python.sh @@ -10,7 +10,11 @@ # conflict with build.sh which is for everything. -pushd xgboost +#pushd xgboost +oldpath=`pwd` +cd ./xgboost/ +#remove the pre-compiled .so and trigger the system's on-the-fly compiling +make clean if make python; then echo "Successfully build multi-thread xgboost" else @@ -23,4 +27,4 @@ else echo "If you want multi-threaded version" echo "See additional instructions in doc/build.md" fi -popd +cd $oldpath diff --git a/python-package/xgboost/compat.py b/python-package/xgboost/compat.py new file mode 100644 index 000000000000..8499b7824699 --- /dev/null +++ b/python-package/xgboost/compat.py @@ -0,0 +1,47 @@ +# coding: utf-8 +# pylint: disable=unused-import, invalid-name, wrong-import-position +"""For compatibility""" + +from __future__ import absolute_import + +import sys + + +PY3 = (sys.version_info[0] == 3) + +if PY3: + # pylint: disable=invalid-name, redefined-builtin + STRING_TYPES = str, +else: + # pylint: disable=invalid-name + STRING_TYPES = basestring, + +# pandas +try: + from pandas import DataFrame + PANDAS_INSTALLED = True +except ImportError: + + class DataFrame(object): + """ dummy for pandas.DataFrame """ + pass + + PANDAS_INSTALLED = False + +# sklearn +try: + from sklearn.base import BaseEstimator + from sklearn.base import RegressorMixin, ClassifierMixin + from sklearn.preprocessing import LabelEncoder + SKLEARN_INSTALLED = True + + XGBModelBase = BaseEstimator + XGBRegressorBase = RegressorMixin + XGBClassifierBase = ClassifierMixin +except ImportError: + SKLEARN_INSTALLED = False + + # used for compatiblity without sklearn + XGBModelBase = object + XGBClassifierBase = object + XGBRegressorBase = object diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index 0273b7230da1..ba76f31a66e2 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -4,7 +4,6 @@ from __future__ import absolute_import import os -import sys import ctypes import collections @@ -13,20 +12,12 @@ from .libpath import find_lib_path +from .compat import STRING_TYPES, PY3, DataFrame class XGBoostError(Exception): """Error throwed by xgboost trainer.""" pass -PY3 = (sys.version_info[0] == 3) - -if PY3: - # pylint: disable=invalid-name, redefined-builtin - STRING_TYPES = str, -else: - # pylint: disable=invalid-name - STRING_TYPES = basestring, - def from_pystr_to_cstr(data): """Convert a list of Python str to C pointer @@ -138,28 +129,50 @@ def c_array(ctype, values): return (ctype * len(values))(*values) -def _maybe_from_pandas(data, feature_names, feature_types): - """ Extract internal data from pd.DataFrame """ - try: - import pandas as pd - except ImportError: - return data, feature_names, feature_types - if not isinstance(data, pd.DataFrame): +PANDAS_DTYPE_MAPPER = {'int8': 'int', 'int16': 'int', 'int32': 'int', 'int64': 'int', + 'uint8': 'int', 'uint16': 'int', 'uint32': 'int', 'uint64': 'int', + 'float16': 'float', 'float32': 'float', 'float64': 'float', + 'bool': 'i'} + + +def _maybe_pandas_data(data, feature_names, feature_types): + """ Extract internal data from pd.DataFrame for DMatrix data """ + + if not isinstance(data, DataFrame): return data, feature_names, 
feature_types - dtypes = data.dtypes - if not all(dtype.name in ('int64', 'float64', 'bool') for dtype in dtypes): - raise ValueError('DataFrame.dtypes must be int, float or bool') + data_dtypes = data.dtypes + if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in data_dtypes): + raise ValueError('DataFrame.dtypes for data must be int, float or bool') if feature_names is None: feature_names = data.columns.format() + if feature_types is None: - mapper = {'int64': 'int', 'float64': 'q', 'bool': 'i'} - feature_types = [mapper[dtype.name] for dtype in dtypes] + feature_types = [PANDAS_DTYPE_MAPPER[dtype.name] for dtype in data_dtypes] + data = data.values.astype('float') + return data, feature_names, feature_types + +def _maybe_pandas_label(label): + """ Extract internal data from pd.DataFrame for DMatrix label """ + + if isinstance(label, DataFrame): + if len(label.columns) > 1: + raise ValueError('DataFrame for label cannot have multiple columns') + + label_dtypes = label.dtypes + if not all(dtype.name in PANDAS_DTYPE_MAPPER for dtype in label_dtypes): + raise ValueError('DataFrame.dtypes for label must be int, float or bool') + else: + label = label.values.astype('float') + # pd.Series can be passed to xgb as it is + + return label + class DMatrix(object): """Data Matrix used in XGBoost. @@ -192,20 +205,19 @@ def __init__(self, data, label=None, missing=0.0, silent : boolean, optional Whether print messages during construction feature_names : list, optional - Labels for features. + Set names for features. feature_types : list, optional - Labels for features. + Set types for features. """ # force into void_p, mac need to pass things in as void_p if data is None: self.handle = None return - klass = getattr(getattr(data, '__class__', None), '__name__', None) - if klass == 'DataFrame': - # once check class name to avoid unnecessary pandas import - data, feature_names, feature_types = _maybe_from_pandas(data, feature_names, - feature_types) + data, feature_names, feature_types = _maybe_pandas_data(data, + feature_names, + feature_types) + label = _maybe_pandas_label(label) if isinstance(data, STRING_TYPES): self.handle = ctypes.c_void_p() @@ -223,7 +235,7 @@ def __init__(self, data, label=None, missing=0.0, csr = scipy.sparse.csr_matrix(data) self._init_from_csr(csr) except: - raise TypeError('can not intialize DMatrix from {}'.format(type(data).__name__)) + raise TypeError('can not initialize DMatrix from {}'.format(type(data).__name__)) if label is not None: self.set_label(label) if weight is not None: @@ -511,7 +523,7 @@ def feature_names(self, feature_names): feature_names : list or None Labels for features. None will reset existing feature names """ - if not feature_names is None: + if feature_names is not None: # validate feature name if not isinstance(feature_names, list): feature_names = list(feature_names) @@ -520,10 +532,11 @@ def feature_names(self, feature_names): if len(feature_names) != self.num_col(): msg = 'feature_names must have the same length as data' raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - if not all(isinstance(f, STRING_TYPES) and f.isalnum() + # prohibit to use symbols may affect to parse. e.g. 
[]< + if not all(isinstance(f, STRING_TYPES) and + not any(x in f for x in set(('[', ']', '<'))) for f in feature_names): - raise ValueError('all feature_names must be alphanumerics') + raise ValueError('feature_names may not contain [, ] or <') else: # reset feature_types also self.feature_types = None @@ -541,7 +554,7 @@ def feature_types(self, feature_types): feature_types : list or None Labels for features. None will reset existing feature names """ - if not feature_types is None: + if feature_types is not None: if self.feature_names is None: msg = 'Unable to set feature types before setting names' @@ -556,12 +569,11 @@ def feature_types(self, feature_types): if len(feature_types) != self.num_col(): msg = 'feature_types must have the same length as data' raise ValueError(msg) - # prohibit to use symbols may affect to parse. e.g. ``[]=.`` - valid = ('q', 'i', 'int', 'float') + valid = ('int', 'float', 'i', 'q') if not all(isinstance(f, STRING_TYPES) and f in valid for f in feature_types): - raise ValueError('all feature_names must be {i, q, int, float}') + raise ValueError('All feature_names must be {int, float, i, q}') self._feature_types = feature_types @@ -745,8 +757,13 @@ def eval_set(self, evals, iteration=0, feval=None): else: res = '[%d]' % iteration for dmat, evname in evals: - name, val = feval(self.predict(dmat), dmat) - res += '\t%s-%s:%f' % (evname, name, val) + feval_ret = feval(self.predict(dmat), dmat) + if isinstance(feval_ret, list): + for name, val in feval_ret: + res += '\t%s-%s:%f' % (evname, name, val) + else: + name, val = feval_ret + res += '\t%s-%s:%f' % (evname, name, val) return res def eval(self, data, name='eval', iteration=0): @@ -873,6 +890,7 @@ def load_model(self, fname): _check_call(_LIB.XGBoosterLoadModelFromBuffer(self.handle, ptr, length)) def dump_model(self, fout, fmap='', with_stats=False): + # pylint: disable=consider-using-enumerate """ Dump model into a text file. 
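
With the `eval_set` change in `core.py` above, a custom `feval` passed to `xgb.train` may now return either a single `(name, value)` pair or a list of such pairs, and each pair is printed as its own metric entry. A minimal usage sketch of that behaviour (the agaricus file paths are assumed to match the demo scripts and are not part of the change itself):

```python
# Sketch: a custom feval that returns a list of (name, value) tuples,
# which the updated Booster.eval_set reports as one entry per metric.
import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix('../data/agaricus.txt.train')  # assumed demo data path
dtest = xgb.DMatrix('../data/agaricus.txt.test')
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}

def multi_eval(preds, dmat):
    """Compute two custom metrics at once."""
    labels = dmat.get_label()
    err = float(np.sum((preds > 0.5) != labels)) / len(labels)
    mae = float(np.mean(np.abs(preds - labels)))
    return [('my-error', err), ('my-mae', mae)]

bst = xgb.train(param, dtrain, num_boost_round=2,
                evals=[(dtest, 'eval'), (dtrain, 'train')],
                feval=multi_eval)
```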
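Likewise, the `_maybe_pandas_data`/`_maybe_pandas_label` helpers above widen the pandas dtypes a `DMatrix` accepts and allow a single-column `DataFrame` as the label; a small sketch with made-up data:

```python
# Sketch with made-up data: the dtypes below are all covered by
# PANDAS_DTYPE_MAPPER, and a single-column DataFrame works as the label.
import numpy as np
import pandas as pd
import xgboost as xgb

df = pd.DataFrame({
    'f_int8':    np.array([0, 1, 2, 3], dtype='int8'),
    'f_uint32':  np.array([10, 20, 30, 40], dtype='uint32'),
    'f_float32': np.array([0.5, 1.5, 2.5, 3.5], dtype='float32'),
    'f_bool':    np.array([True, False, True, False]),
})
label = pd.DataFrame({'y': [0, 1, 0, 1]})  # more than one column would raise ValueError

dmat = xgb.DMatrix(df, label=label)
print(dmat.feature_names)  # taken from df.columns
print(dmat.feature_types)  # mapped through PANDAS_DTYPE_MAPPER
```
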
diff --git a/python-package/xgboost/libpath.py b/python-package/xgboost/libpath.py index 293719f01bc4..5df72dd3da4a 100644 --- a/python-package/xgboost/libpath.py +++ b/python-package/xgboost/libpath.py @@ -36,9 +36,10 @@ def find_lib_path(): else: dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path] lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)] + #From github issues, most of installation errors come from machines w/o compilers if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False): raise XGBoostLibraryNotFound( 'Cannot find XGBoost Libarary in the candicate path, ' + - 'did you run build.sh in root path?\n' + 'did you install compilers and run build.sh in root path?\n' 'List of candidates:\n' + ('\n'.join(dll_path))) return lib_path diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py index 50a844a1e6a1..96f705a689b0 100644 --- a/python-package/xgboost/plotting.py +++ b/python-package/xgboost/plotting.py @@ -5,13 +5,13 @@ from __future__ import absolute_import import re +from io import BytesIO import numpy as np from .core import Booster - -from io import BytesIO +from .sklearn import XGBModel def plot_importance(booster, ax=None, height=0.2, - xlim=None, title='Feature importance', + xlim=None, ylim=None, title='Feature importance', xlabel='F score', ylabel='Features', grid=True, **kwargs): @@ -19,14 +19,16 @@ def plot_importance(booster, ax=None, height=0.2, Parameters ---------- - booster : Booster or dict - Booster instance, or dict taken by Booster.get_fscore() + booster : Booster, XGBModel or dict + Booster or XGBModel instance, or dict taken by Booster.get_fscore() ax : matplotlib Axes, default None Target axes instance. If None, new figure and axes will be created. height : float, default 0.2 Bar height, passed to ax.barh() xlim : tuple, default None Tuple passed to axes.xlim() + ylim : tuple, default None + Tuple passed to axes.ylim() title : str, default "Feature importance" Axes title. To disable, pass None. 
xlabel : str, default "F score" @@ -46,12 +48,14 @@ def plot_importance(booster, ax=None, height=0.2, except ImportError: raise ImportError('You must install matplotlib to plot importance') - if isinstance(booster, Booster): + if isinstance(booster, XGBModel): + importance = booster.booster().get_fscore() + elif isinstance(booster, Booster): importance = booster.get_fscore() elif isinstance(booster, dict): importance = booster else: - raise ValueError('tree must be Booster or dict instance') + raise ValueError('tree must be Booster, XGBModel or dict instance') if len(importance) == 0: raise ValueError('Booster.get_fscore() results in empty') @@ -73,12 +77,19 @@ def plot_importance(booster, ax=None, height=0.2, ax.set_yticklabels(labels) if xlim is not None: - if not isinstance(xlim, tuple) or len(xlim, 2): + if not isinstance(xlim, tuple) or len(xlim) != 2: raise ValueError('xlim must be a tuple of 2 elements') else: xlim = (0, max(values) * 1.1) ax.set_xlim(xlim) + if ylim is not None: + if not isinstance(ylim, tuple) or len(ylim) != 2: + raise ValueError('ylim must be a tuple of 2 elements') + else: + ylim = (-1, len(importance)) + ax.set_ylim(ylim) + if title is not None: ax.set_title(title) if xlabel is not None: @@ -142,8 +153,8 @@ def to_graphviz(booster, num_trees=0, rankdir='UT', Parameters ---------- - booster : Booster - Booster instance + booster : Booster, XGBModel + Booster or XGBModel instance num_trees : int, default 0 Specify the ordinal number of target tree rankdir : str, default "UT" @@ -165,8 +176,11 @@ def to_graphviz(booster, num_trees=0, rankdir='UT', except ImportError: raise ImportError('You must install graphviz to plot tree') - if not isinstance(booster, Booster): - raise ValueError('booster must be Booster instance') + if not isinstance(booster, (Booster, XGBModel)): + raise ValueError('booster must be Booster or XGBModel instance') + + if isinstance(booster, XGBModel): + booster = booster.booster() tree = booster.get_dump()[num_trees] tree = tree.split() @@ -193,8 +207,8 @@ def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs): Parameters ---------- - booster : Booster - Booster instance + booster : Booster, XGBModel + Booster or XGBModel instance num_trees : int, default 0 Specify the ordinal number of target tree rankdir : str, default "UT" @@ -216,7 +230,6 @@ def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs): except ImportError: raise ImportError('You must install matplotlib to plot tree') - if ax is None: _, ax = plt.subplots(1, 1) diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index a2761c5abcf7..f3e38059a792 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -7,23 +7,9 @@ from .core import Booster, DMatrix, XGBoostError from .training import train -try: - from sklearn.base import BaseEstimator - from sklearn.base import RegressorMixin, ClassifierMixin - from sklearn.preprocessing import LabelEncoder - SKLEARN_INSTALLED = True -except ImportError: - SKLEARN_INSTALLED = False - -# used for compatiblity without sklearn -XGBModelBase = object -XGBClassifierBase = object -XGBRegressorBase = object - -if SKLEARN_INSTALLED: - XGBModelBase = BaseEstimator - XGBRegressorBase = RegressorMixin - XGBClassifierBase = ClassifierMixin +from .compat import (SKLEARN_INSTALLED, XGBModelBase, + XGBClassifierBase, XGBRegressorBase, LabelEncoder) + class XGBModel(XGBModelBase): # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name @@ -54,6 
+40,14 @@ class XGBModel(XGBModelBase): Subsample ratio of the training instance. colsample_bytree : float Subsample ratio of columns when constructing each tree. + colsample_bylevel : float + Subsample ratio of columns for each split, in each level. + reg_alpha : float (xgb's alpha) + L2 regularization term on weights + reg_lambda : float (xgb's lambda) + L1 regularization term on weights + scale_pos_weight : float + Balancing of positive and negative weights. base_score: The initial prediction score of all instances, global bias. @@ -66,7 +60,8 @@ class XGBModel(XGBModelBase): def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="reg:linear", nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0, - subsample=1, colsample_bytree=1, + subsample=1, colsample_bytree=1, colsample_bylevel=1, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None): if not SKLEARN_INSTALLED: raise XGBoostError('sklearn needs to be installed in order to use this module') @@ -82,6 +77,10 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, self.max_delta_step = max_delta_step self.subsample = subsample self.colsample_bytree = colsample_bytree + self.colsample_bylevel = colsample_bylevel + self.reg_alpha = reg_alpha + self.reg_lambda = reg_lambda + self.scale_pos_weight = scale_pos_weight self.base_score = base_score self.seed = seed @@ -131,7 +130,7 @@ def get_xgb_params(self): def fit(self, X, y, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True): - # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init + # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init, redefined-variable-type """ Fit the gradient boosting model @@ -165,7 +164,7 @@ def fit(self, X, y, eval_set=None, eval_metric=None, """ trainDmatrix = DMatrix(X, label=y, missing=self.missing) - eval_results = {} + evals_result = {} if eval_set is not None: evals = list(DMatrix(x[0], label=x[1]) for x in eval_set) evals = list(zip(evals, ["validation_{}".format(i) for i in @@ -185,23 +184,62 @@ def fit(self, X, y, eval_set=None, eval_metric=None, self._Booster = train(params, trainDmatrix, self.n_estimators, evals=evals, early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, + evals_result=evals_result, feval=feval, verbose_eval=verbose) - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - eval_results = {k: np.array(v) for k, v in eval_results.items()} - self.eval_results = eval_results + + if evals_result: + for val in evals_result.items(): + evals_result_key = list(val[1].keys())[0] + evals_result[val[0]][evals_result_key] = val[1][evals_result_key] + self.evals_result_ = evals_result if early_stopping_rounds is not None: self.best_score = self._Booster.best_score self.best_iteration = self._Booster.best_iteration return self - def predict(self, data): + def predict(self, data, output_margin=False, ntree_limit=0): # pylint: disable=missing-docstring,invalid-name test_dmatrix = DMatrix(data, missing=self.missing) - return self.booster().predict(test_dmatrix) + return self.booster().predict(test_dmatrix, + output_margin=output_margin, + ntree_limit=ntree_limit) + + def evals_result(self): + """Return the evaluation results. + + If eval_set is passed to the `fit` function, you can call evals_result() to + get evaluation results for all passed eval_sets. 
When eval_metric is also + passed to the `fit` function, the evals_result will contain the eval_metrics + passed to the `fit` function + + Returns + ------- + evals_result : dictionary + + Example + ------- + param_dist = {'objective':'binary:logistic', 'n_estimators':2} + + clf = xgb.XGBModel(**param_dist) + + clf.fit(X_train, y_train, + eval_set=[(X_train, y_train), (X_test, y_test)], + eval_metric='logloss', + verbose=True) + + evals_result = clf.evals_result() + + The variable evals_result will contain: + {'validation_0': {'logloss': ['0.604835', '0.531479']}, + 'validation_1': {'logloss': ['0.41965', '0.17686']}} + """ + if self.evals_result_: + evals_result = self.evals_result_ + else: + raise XGBoostError('No results.') + + return evals_result class XGBClassifier(XGBModel, XGBClassifierBase): @@ -214,18 +252,20 @@ def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True, objective="binary:logistic", nthread=-1, gamma=0, min_child_weight=1, - max_delta_step=0, subsample=1, colsample_bytree=1, + max_delta_step=0, subsample=1, colsample_bytree=1, colsample_bylevel=1, + reg_alpha=0, reg_lambda=1, scale_pos_weight=1, base_score=0.5, seed=0, missing=None): super(XGBClassifier, self).__init__(max_depth, learning_rate, n_estimators, silent, objective, nthread, gamma, min_child_weight, max_delta_step, subsample, - colsample_bytree, - base_score, seed, missing) + colsample_bytree, colsample_bylevel, + reg_alpha, reg_lambda, + scale_pos_weight, base_score, seed, missing) def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, early_stopping_rounds=None, verbose=True): - # pylint: disable = attribute-defined-outside-init,arguments-differ + # pylint: disable = attribute-defined-outside-init,arguments-differ, redefined-variable-type """ Fit gradient boosting classifier @@ -259,7 +299,7 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, If `verbose` and an evaluation set is used, writes the evaluation metric measured on the validation set to stderr. 
""" - eval_results = {} + evals_result = {} self.classes_ = list(np.unique(y)) self.n_classes_ = len(self.classes_) if self.n_classes_ > 2: @@ -299,13 +339,14 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, self._Booster = train(xgb_options, train_dmatrix, self.n_estimators, evals=evals, early_stopping_rounds=early_stopping_rounds, - evals_result=eval_results, feval=feval, + evals_result=evals_result, feval=feval, verbose_eval=verbose) - if eval_results: - eval_results = {k: np.array(v, dtype=float) - for k, v in eval_results.items()} - self.eval_results = eval_results + if evals_result: + for val in evals_result.items(): + evals_result_key = list(val[1].keys())[0] + evals_result[val[0]][evals_result_key] = val[1][evals_result_key] + self.evals_result_ = evals_result if early_stopping_rounds is not None: self.best_score = self._Booster.best_score @@ -313,9 +354,11 @@ def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None, return self - def predict(self, data): + def predict(self, data, output_margin=False, ntree_limit=0): test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) + class_probs = self.booster().predict(test_dmatrix, + output_margin=output_margin, + ntree_limit=ntree_limit) if len(class_probs.shape) > 1: column_indexes = np.argmax(class_probs, axis=1) else: @@ -323,9 +366,11 @@ def predict(self, data): column_indexes[class_probs > 0.5] = 1 return self._le.inverse_transform(column_indexes) - def predict_proba(self, data): + def predict_proba(self, data, output_margin=False, ntree_limit=0): test_dmatrix = DMatrix(data, missing=self.missing) - class_probs = self.booster().predict(test_dmatrix) + class_probs = self.booster().predict(test_dmatrix, + output_margin=output_margin, + ntree_limit=ntree_limit) if self.objective == "multi:softprob": return class_probs else: @@ -333,6 +378,42 @@ def predict_proba(self, data): classzero_probs = 1.0 - classone_probs return np.vstack((classzero_probs, classone_probs)).transpose() + def evals_result(self): + """Return the evaluation results. + + If eval_set is passed to the `fit` function, you can call evals_result() to + get evaluation results for all passed eval_sets. When eval_metric is also + passed to the `fit` function, the evals_result will contain the eval_metrics + passed to the `fit` function + + Returns + ------- + evals_result : dictionary + + Example + ------- + param_dist = {'objective':'binary:logistic', 'n_estimators':2} + + clf = xgb.XGBClassifier(**param_dist) + + clf.fit(X_train, y_train, + eval_set=[(X_train, y_train), (X_test, y_test)], + eval_metric='logloss', + verbose=True) + + evals_result = clf.evals_result() + + The variable evals_result will contain: + {'validation_0': {'logloss': ['0.604835', '0.531479']}, + 'validation_1': {'logloss': ['0.41965', '0.17686']}} + """ + if self.evals_result_: + evals_result = self.evals_result_ + else: + raise XGBoostError('No results.') + + return evals_result + class XGBRegressor(XGBModel, XGBRegressorBase): # pylint: disable=missing-docstring __doc__ = """Implementation of the scikit-learn API for XGBoost regression. 
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py index a6a7c203b3bd..e47db0bc60cf 100644 --- a/python-package/xgboost/training.py +++ b/python-package/xgboost/training.py @@ -10,7 +10,8 @@ from .core import Booster, STRING_TYPES def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, - early_stopping_rounds=None, evals_result=None, verbose_eval=True): + maximize=False, early_stopping_rounds=None, evals_result=None, + verbose_eval=True, learning_rates=None, xgb_model=None): # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init """Train a booster with given parameters. @@ -29,26 +30,83 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, Customized objective function. feval : function Customized evaluation function. + maximize : bool + Whether to maximize feval. early_stopping_rounds: int Activates early stopping. Validation error needs to decrease at least every round(s) to continue training. Requires at least one item in evals. If there's more than one, will use the last. Returns the model from the last iteration (not the best one). - If early stopping occurs, the model will have two additional fields: - bst.best_score and bst.best_iteration. + If early stopping occurs, the model will have three additional fields: + bst.best_score, bst.best_iteration and bst.best_ntree_limit. + (Use bst.best_ntree_limit to get the correct value if num_parallel_tree + and/or num_class appears in the parameters) evals_result: dict - This dictionary stores the evaluation results of all the items in watchlist - verbose_eval : bool - If `verbose_eval` then the evaluation metric on the validation set, if - given, is printed at each boosting stage. + This dictionary stores the evaluation results of all the items in watchlist. + Example: with a watchlist containing [(dtest,'eval'), (dtrain,'train')] and + and a paramater containing ('eval_metric', 'logloss') + Returns: {'train': {'logloss': ['0.48253', '0.35953']}, + 'eval': {'logloss': ['0.480385', '0.357756']}} + verbose_eval : bool or int + Requires at least one item in evals. + If `verbose_eval` is True then the evaluation metric on the validation set is + printed at each boosting stage. + If `verbose_eval` is an integer then the evaluation metric on the validation set + is printed at every given `verbose_eval` boosting stage. The last boosting stage + / the boosting stage found by using `early_stopping_rounds` is also printed. + Example: with verbose_eval=4 and at least one item in evals, an evaluation metric + is printed every 4 boosting stages, instead of every boosting stage. + learning_rates: list or function + List of learning rate for each boosting round + or a customized function that calculates eta in terms of + current number of round and the total number of boosting round (e.g. yields + learning rate decay) + - list l: eta = l[boosting round] + - function f: eta = f(boosting round, num_boost_round) + xgb_model : file name of stored xgb model or 'Booster' instance + Xgb model to be loaded before training (allows training continuation). 
Returns ------- booster : a trained booster model """ evals = list(evals) + if isinstance(params, dict) \ + and 'eval_metric' in params \ + and isinstance(params['eval_metric'], list): + params = dict((k, v) for k, v in params.items()) + eval_metrics = params['eval_metric'] + params.pop("eval_metric", None) + params = list(params.items()) + for eval_metric in eval_metrics: + params += [('eval_metric', eval_metric)] + bst = Booster(params, [dtrain] + [d[0] for d in evals]) + nboost = 0 + num_parallel_tree = 1 + + if isinstance(verbose_eval, bool): + verbose_eval_every_line = False + else: + if isinstance(verbose_eval, int): + verbose_eval_every_line = verbose_eval + verbose_eval = True if verbose_eval_every_line > 0 else False + + if xgb_model is not None: + if not isinstance(xgb_model, STRING_TYPES): + xgb_model = xgb_model.save_raw() + bst = Booster(params, [dtrain] + [d[0] for d in evals], model_file=xgb_model) + nboost = len(bst.get_dump()) + else: + bst = Booster(params, [dtrain] + [d[0] for d in evals]) + + _params = dict(params) if isinstance(params, list) else params + if 'num_parallel_tree' in _params: + num_parallel_tree = _params['num_parallel_tree'] + nboost //= num_parallel_tree + if 'num_class' in _params: + nboost //= _params['num_class'] if evals_result is not None: if not isinstance(evals_result, dict): @@ -56,11 +114,12 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, else: evals_name = [d[1] for d in evals] evals_result.clear() - evals_result.update({key: [] for key in evals_name}) + evals_result.update(dict([(key, {}) for key in evals_name])) if not early_stopping_rounds: for i in range(num_boost_round): bst.update(dtrain, i, obj) + nboost += 1 if len(evals) != 0: bst_eval_set = bst.eval_set(evals, i, feval) if isinstance(bst_eval_set, STRING_TYPES): @@ -69,11 +128,27 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, msg = bst_eval_set.decode() if verbose_eval: - sys.stderr.write(msg + '\n') + if verbose_eval_every_line: + if i % verbose_eval_every_line == 0 or i == num_boost_round - 1: + sys.stderr.write(msg + '\n') + else: + sys.stderr.write(msg + '\n') + if evals_result is not None: - res = re.findall(":-?([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) + res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg) + for key in evals_name: + evals_idx = evals_name.index(key) + res_per_eval = len(res) // len(evals_name) + for r in range(res_per_eval): + res_item = res[(evals_idx*res_per_eval) + r] + res_key = res_item[0] + res_val = res_item[1] + if res_key in evals_result[key]: + evals_result[key][res_key].append(res_val) + else: + evals_result[key][res_key] = [res_val] + bst.best_iteration = (nboost - 1) + bst.best_ntree_limit = nboost * num_parallel_tree return bst else: @@ -81,15 +156,18 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, if len(evals) < 1: raise ValueError('For early stopping you need at least one set in evals.') - sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ + if verbose_eval: + sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\ evals[-1][1], early_stopping_rounds)) # is params a list of tuples? are we using multiple eval metrics? 
if isinstance(params, list): if len(params) != len(dict(params).items()): - raise ValueError('Check your params.'\ - 'Early stopping works with single eval metric only.') - params = dict(params) + params = dict(params) + sys.stderr.write("Multiple eval metrics have been passed: " \ + "'{0}' will be used for early stopping.\n\n".format(params['eval_metric'])) + else: + params = dict(params) # either minimize loss or maximize AUC/MAP/NDCG maximize_score = False @@ -97,6 +175,8 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, maximize_metrics = ('auc', 'map', 'ndcg') if any(params['eval_metric'].startswith(x) for x in maximize_metrics): maximize_score = True + if feval is not None: + maximize_score = maximize if maximize_score: best_score = 0.0 @@ -104,10 +184,19 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, best_score = float('inf') best_msg = '' - best_score_i = 0 + best_score_i = (nboost - 1) + + if isinstance(learning_rates, list) and len(learning_rates) != num_boost_round: + raise ValueError("Length of list 'learning_rates' has to equal 'num_boost_round'.") for i in range(num_boost_round): + if learning_rates is not None: + if isinstance(learning_rates, list): + bst.set_param({'eta': learning_rates[i]}) + else: + bst.set_param({'eta': learning_rates(i, num_boost_round)}) bst.update(dtrain, i, obj) + nboost += 1 bst_eval_set = bst.eval_set(evals, i, feval) if isinstance(bst_eval_set, STRING_TYPES): @@ -116,26 +205,41 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None, msg = bst_eval_set.decode() if verbose_eval: - sys.stderr.write(msg + '\n') + if verbose_eval_every_line: + if i % verbose_eval_every_line == 0 or i == num_boost_round - 1: + sys.stderr.write(msg + '\n') + else: + sys.stderr.write(msg + '\n') if evals_result is not None: - res = re.findall(":-?([0-9.]+).", msg) - for key, val in zip(evals_name, res): - evals_result[key].append(val) + res = re.findall("([0-9a-zA-Z@]+[-]*):-?([0-9.]+).", msg) + for key in evals_name: + evals_idx = evals_name.index(key) + res_per_eval = len(res) // len(evals_name) + for r in range(res_per_eval): + res_item = res[(evals_idx*res_per_eval) + r] + res_key = res_item[0] + res_val = res_item[1] + if res_key in evals_result[key]: + evals_result[key][res_key].append(res_val) + else: + evals_result[key][res_key] = [res_val] score = float(msg.rsplit(':', 1)[1]) if (maximize_score and score > best_score) or \ (not maximize_score and score < best_score): best_score = score - best_score_i = i + best_score_i = (nboost - 1) best_msg = msg elif i - best_score_i >= early_stopping_rounds: - sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) + if verbose_eval: + sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg)) bst.best_score = best_score bst.best_iteration = best_score_i break bst.best_score = best_score bst.best_iteration = best_score_i + bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree return bst @@ -179,11 +283,14 @@ def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None): ret.append(CVPack(dtrain, dtest, plst)) return ret - -def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True): +def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True, trial=0): # pylint: disable=invalid-name """ Aggregate cross-validation results. + + If show_progress is true, progress is displayed in every call. 
If + show_progress is an integer, progress will only be displayed every + `show_progress` trees, tracked via trial. """ cvmap = {} idx = rlist[0].split()[0] @@ -217,8 +324,6 @@ def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True): index.extend([k + '-mean', k + '-std']) results.extend([mean, std]) - - if as_pandas: try: import pandas as pd @@ -232,15 +337,16 @@ def aggcv(rlist, show_stdv=True, show_progress=None, as_pandas=True): if show_progress is None: show_progress = True - if show_progress: + if (isinstance(show_progress, int) and trial % show_progress == 0) or (isinstance(show_progress, bool) and show_progress): sys.stderr.write(msg + '\n') + sys.stderr.flush() return results def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), - obj=None, feval=None, fpreproc=None, as_pandas=True, - show_progress=None, show_stdv=True, seed=0): + obj=None, feval=None, maximize=False, early_stopping_rounds=None, + fpreproc=None, as_pandas=True, show_progress=None, show_stdv=True, seed=0): # pylint: disable = invalid-name """Cross-validation with given paramaters. @@ -260,15 +366,23 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), Custom objective function. feval : function Custom evaluation function. + maximize : bool + Whether to maximize feval. + early_stopping_rounds: int + Activates early stopping. CV error needs to decrease at least + every round(s) to continue. + Last entry in evaluation history is the one from best iteration. fpreproc : function Preprocessing function that takes (dtrain, dtest, param) and returns transformed versions of those. as_pandas : bool, default True Return pd.DataFrame when pandas is installed. If False or pandas is not installed, return np.ndarray - show_progress : bool or None, default None + show_progress : bool, int, or None, default None Whether to display the progress. If None, progress will be displayed - when np.ndarray is returned. + when np.ndarray is returned. If True, progress will be displayed at + boosting stage. If an integer is given, progress will be displayed + at every given `show_progress` boosting stage. show_stdv : bool, default True Whether to display the standard deviation in progress. Results are not affected, and always contains std. 
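A hedged sketch, not part of the diff, of the new train()/cv() options documented above: per-round learning rates (applied in the early-stopping code path in this version), training continuation via xgb_model, integer verbose_eval, and early stopping inside cv(). dtrain/dtest and 'tmp.model' are placeholder names assumed for illustration.

import numpy as np
import xgboost as xgb

dtrain = xgb.DMatrix(np.random.rand(200, 10), label=np.random.randint(2, size=200))
dtest = xgb.DMatrix(np.random.rand(50, 10), label=np.random.randint(2, size=50))
params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}

evals_result = {}
bst = xgb.train(params, dtrain, num_boost_round=30,
                evals=[(dtrain, 'train'), (dtest, 'eval')],
                learning_rates=lambda i, n: 0.3 * (0.99 ** i),  # eta decay, called as f(round, num_boost_round)
                early_stopping_rounds=5,
                evals_result=evals_result,
                verbose_eval=5)                                 # print every 5th boosting stage
print(bst.best_iteration, bst.best_ntree_limit)

# continue boosting from a previously saved model for 10 more rounds
bst.save_model('tmp.model')
bst = xgb.train(params, dtrain, num_boost_round=10, xgb_model='tmp.model')

# cross-validation now supports early stopping; history is truncated at the best round
history = xgb.cv(params, dtrain, num_boost_round=50, nfold=3,
                 metrics=('logloss',), early_stopping_rounds=5, seed=0)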
@@ -279,6 +393,28 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), ------- evaluation history : list(string) """ + if early_stopping_rounds is not None: + if len(metrics) > 1: + raise ValueError('Check your params.'\ + 'Early stopping works with single eval metric only.') + + sys.stderr.write("Will train until cv error hasn't decreased in {} rounds.\n".format(\ + early_stopping_rounds)) + + maximize_score = False + if len(metrics) == 1: + maximize_metrics = ('auc', 'map', 'ndcg') + if any(metrics[0].startswith(x) for x in maximize_metrics): + maximize_score = True + if feval is not None: + maximize_score = maximize + + if maximize_score: + best_score = 0.0 + else: + best_score = float('inf') + + best_score_i = 0 results = [] cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc) for i in range(num_boost_round): @@ -286,9 +422,20 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), fold.update(i, obj) res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv=show_stdv, show_progress=show_progress, - as_pandas=as_pandas) + as_pandas=as_pandas, trial=i) results.append(res) + if early_stopping_rounds is not None: + score = res[0] + if (maximize_score and score > best_score) or \ + (not maximize_score and score < best_score): + best_score = score + best_score_i = i + elif i - best_score_i >= early_stopping_rounds: + sys.stderr.write("Stopping. Best iteration: {}\n".format(best_score_i)) + results = results[:best_score_i+1] + break + if as_pandas: try: import pandas as pd @@ -299,4 +446,3 @@ def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(), results = np.array(results) return results - diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh index 3a026966dc78..1e62b5b46f7e 100755 --- a/scripts/travis_script.sh +++ b/scripts/travis_script.sh @@ -64,7 +64,7 @@ if [ ${TASK} == "python-package" -o ${TASK} == "python-package3" ]; then conda create -n myenv python=2.7 fi source activate myenv - conda install numpy scipy pandas matplotlib nose + conda install numpy scipy pandas matplotlib nose scikit-learn python -m pip install graphviz make all CXX=${CXX} || exit -1 diff --git a/src/data.h b/src/data.h index 3c4a14987216..9bcb84cedacd 100644 --- a/src/data.h +++ b/src/data.h @@ -14,7 +14,7 @@ namespace xgboost { /*! - * \brief unsigned interger type used in boost, + * \brief unsigned integer type used in boost, * used for feature index and row index */ typedef unsigned bst_uint; @@ -35,8 +35,8 @@ struct bst_gpair { }; /*! - * \brief extra information that might needed by gbm and tree module - * these information are not necessarily presented, and can be empty + * \brief extra information that might be needed by gbm and tree module + * this information is not necessarily present, and can be empty */ struct BoosterInfo { /*! \brief number of rows in the data */ @@ -53,7 +53,7 @@ struct BoosterInfo { /*! \brief number of rows, number of columns */ BoosterInfo(void) : num_row(0), num_col(0) { } - /*! \brief get root of ith instance */ + /*! \brief get root of i-th instance */ inline unsigned GetRoot(size_t i) const { return root_index.size() == 0 ? 
0 : root_index[i]; } @@ -120,13 +120,13 @@ struct ColBatch : public SparseBatch { }; /** * \brief interface of feature matrix, needed for tree construction - * this interface defines two way to access features, - * row access is defined by iterator of RowBatch - * col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch + * this interface defines two ways to access features: + * row access is defined by iterator of RowBatch + * col access is optional, checked by HaveColAccess, and defined by iterator of ColBatch */ class IFMatrix { public: - // the interface only need to ganrantee row iter + // the interface only need to guarantee row iter // column iter is active, when ColIterator is called, row_iter can be disabled /*! \brief get the row iterator associated with FMatrix */ virtual utils::IIterator *RowIterator(void) = 0; @@ -142,7 +142,7 @@ class IFMatrix { * \brief check if column access is supported, if not, initialize column access * \param enabled whether certain feature should be included in column access * \param subsample subsample ratio when generating column access - * \param max_row_perbatch auxilary information, maximum row used in each column batch + * \param max_row_perbatch auxiliary information, maximum row used in each column batch * this is a hint information that can be ignored by the implementation */ virtual void InitColAccess(const std::vector &enabled, diff --git a/src/gbm/gbm.h b/src/gbm/gbm.h index 60b7474e1e33..8ff692c057ff 100644 --- a/src/gbm/gbm.h +++ b/src/gbm/gbm.h @@ -58,7 +58,7 @@ class IGradBooster { return false; } /*! - * \brief peform update to the model(boosting) + * \brief perform update to the model(boosting) * \param p_fmat feature matrix that provide access to features * \param buffer_offset buffer index offset of these instances, if equals -1 * this means we do not have buffer index allocated to the gbm @@ -88,7 +88,7 @@ class IGradBooster { std::vector *out_preds, unsigned ntree_limit = 0) = 0; /*! - * \brief online prediction funciton, predict score for one instance at a time + * \brief online prediction function, predict score for one instance at a time * NOTE: use the batch prediction interface if possible, batch prediction is usually * more efficient than online prediction * This function is NOT threadsafe, make sure you only call from one thread @@ -119,7 +119,7 @@ class IGradBooster { /*! 
* \brief dump the model in text format * \param fmap feature map that may help give interpretations of feature - * \param option extra option of the dumo model + * \param option extra option of the dump model * \return a vector of dump for boosters */ virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) = 0; diff --git a/src/gbm/gbtree-inl.hpp b/src/gbm/gbtree-inl.hpp index 9335ef8e78e6..65fe7e9da1b6 100644 --- a/src/gbm/gbtree-inl.hpp +++ b/src/gbm/gbtree-inl.hpp @@ -31,7 +31,7 @@ class GBTree : public IGradBooster { using namespace std; if (!strncmp(name, "bst:", 4)) { cfg.push_back(std::make_pair(std::string(name+4), std::string(val))); - // set into updaters, if already intialized + // set into updaters, if already initialized for (size_t i = 0; i < updaters.size(); ++i) { updaters[i]->SetParam(name+4, val); } @@ -85,7 +85,7 @@ class GBTree : public IGradBooster { fo.Write(BeginPtr(pred_counter), pred_counter.size() * sizeof(unsigned)); } } - // initialize the predic buffer + // initialize the predict buffer virtual void InitModel(void) { pred_buffer.clear(); pred_counter.clear(); pred_buffer.resize(mparam.PredBufferSize(), 0.0f); @@ -138,10 +138,7 @@ class GBTree : public IGradBooster { { nthread = omp_get_num_threads(); } - thread_temp.resize(nthread, tree::RegTree::FVec()); - for (int i = 0; i < nthread; ++i) { - thread_temp[i].Init(mparam.num_feature); - } + InitThreadTemp(nthread); std::vector &preds = *out_preds; const size_t stride = info.num_row * mparam.num_output_group; preds.resize(stride * (mparam.size_leaf_vector+1)); @@ -194,10 +191,7 @@ class GBTree : public IGradBooster { { nthread = omp_get_num_threads(); } - thread_temp.resize(nthread, tree::RegTree::FVec()); - for (int i = 0; i < nthread; ++i) { - thread_temp[i].Init(mparam.num_feature); - } + InitThreadTemp(nthread); this->PredPath(p_fmat, info, out_preds, ntree_limit); } virtual std::vector DumpModel(const utils::FeatMap& fmap, int option) { @@ -391,6 +385,16 @@ class GBTree : public IGradBooster { } } } + // init thread buffers + inline void InitThreadTemp(int nthread) { + int prev_thread_temp_size = thread_temp.size(); + if (prev_thread_temp_size < nthread) { + thread_temp.resize(nthread, tree::RegTree::FVec()); + for (int i = prev_thread_temp_size; i < nthread; ++i) { + thread_temp[i].Init(mparam.num_feature); + } + } + } // --- data structure --- /*! \brief training parameters */ @@ -442,7 +446,7 @@ class GBTree : public IGradBooster { int num_roots; /*! \brief number of features to be used by trees */ int num_feature; - /*! \brief size of predicton buffer allocated used for buffering */ + /*! \brief size of prediction buffer allocated used for buffering */ int64_t num_pbuffer; /*! 
* \brief how many output group a single instance can produce diff --git a/src/io/io.h b/src/io/io.h index 267bb0bfffd7..6ceff26980be 100644 --- a/src/io/io.h +++ b/src/io/io.h @@ -22,7 +22,7 @@ typedef learner::DMatrix DataMatrix; * \param silent whether print message during loading * \param savebuffer whether temporal buffer the file if the file is in text format * \param loadsplit whether we only load a split of input files - * such that each worker node get a split of the data + * such that each worker node get a split of the data * \param cache_file name of cache_file, used by external memory version * can be NULL, if cache_file is specified, this will be the temporal * space that can be re-used to store intermediate data @@ -38,7 +38,7 @@ DataMatrix* LoadDataMatrix(const char *fname, * note: the saved dmatrix format may not be in exactly same as input * SaveDMatrix will choose the best way to materialize the dmatrix. * \param dmat the dmatrix to be saved - * \param fname file name to be savd + * \param fname file name to be saved * \param silent whether print message during saving */ void SaveDataMatrix(const DataMatrix &dmat, const char *fname, bool silent = false); diff --git a/src/io/libsvm_parser.h b/src/io/libsvm_parser.h index 92eeaf35d2ed..43b8d6b90e5b 100644 --- a/src/io/libsvm_parser.h +++ b/src/io/libsvm_parser.h @@ -31,7 +31,7 @@ struct LibSVMPage : public SparsePage { /*! * \brief libsvm parser that parses the input lines * and returns rows in input data - * factry that was used by threadbuffer template + * factory that was used by threadbuffer template */ class LibSVMPageFactory { public: diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp index 2fa5c83bd950..d2b71e50f4aa 100644 --- a/src/io/page_fmatrix-inl.hpp +++ b/src/io/page_fmatrix-inl.hpp @@ -200,7 +200,7 @@ class FMatrixPage : public IFMatrix { virtual bool HaveColAccess(void) const { return col_size_.size() != 0; } - /*! \brief get number of colmuns */ + /*! \brief get number of columns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); return col_size_.size(); @@ -246,7 +246,7 @@ class FMatrixPage : public IFMatrix { return &col_iter_; } /*! - * \brief colmun based iterator + * \brief column based iterator */ virtual utils::IIterator *ColIterator(const std::vector &fset) { size_t ncol = this->NumCol(); @@ -290,8 +290,10 @@ class FMatrixPage : public IFMatrix { fo->Write(col_size_); } /*! 
- * \brief intialize column data + * \brief initialize column data + * \param enabled the list of enabled columns * \param pkeep probability to keep a row + * \param max_row_perbatch maximum row per batch */ inline void InitColData(const std::vector &enabled, float pkeep, size_t max_row_perbatch) { @@ -319,7 +321,7 @@ class FMatrixPage : public IFMatrix { bytes_write += spage; double tnow = rabit::utils::GetTime(); double tdiff = tnow - tstart; - utils::Printf("Writting to %s in %g MB/s, %lu MB written\n", + utils::Printf("Writing to %s in %g MB/s, %lu MB written\n", col_data_name_.c_str(), (bytes_write >> 20UL) / tdiff, (bytes_write >> 20UL)); diff --git a/src/io/simple_dmatrix-inl.hpp b/src/io/simple_dmatrix-inl.hpp index 190cbdcdf597..063b016655e7 100644 --- a/src/io/simple_dmatrix-inl.hpp +++ b/src/io/simple_dmatrix-inl.hpp @@ -51,7 +51,7 @@ class DMatrixSimple : public DataMatrix { inline void CopyFrom(const DataMatrix &src) { this->Clear(); this->info = src.info; - // clone data content in thos matrix + // clone data contents from src matrix utils::IIterator *iter = src.fmat()->RowIterator(); iter->BeforeFirst(); while (iter->Next()) { @@ -313,7 +313,7 @@ class DMatrixSimple : public DataMatrix { private: // whether is at first bool at_first_; - // pointer to parient + // pointer to parent DMatrixSimple *parent_; // temporal space for batch RowBatch batch_; diff --git a/src/io/simple_fmatrix-inl.hpp b/src/io/simple_fmatrix-inl.hpp index 0e0da4461900..e467263fada5 100644 --- a/src/io/simple_fmatrix-inl.hpp +++ b/src/io/simple_fmatrix-inl.hpp @@ -40,7 +40,7 @@ class FMatrixS : public IFMatrix { virtual bool HaveColAccess(void) const { return col_size_.size() != 0; } - /*! \brief get number of colmuns */ + /*! \brief get number of columns */ virtual size_t NumCol(void) const { utils::Check(this->HaveColAccess(), "NumCol:need column access"); return col_size_.size(); @@ -83,7 +83,7 @@ class FMatrixS : public IFMatrix { return &col_iter_; } /*! - * \brief colmun based iterator + * \brief column based iterator */ virtual utils::IIterator *ColIterator(const std::vector &fset) { size_t ncol = this->NumCol(); @@ -112,7 +112,7 @@ class FMatrixS : public IFMatrix { protected: /*! - * \brief intialize column data + * \brief initialize column data * \param enabled the list of enabled columns * \param pkeep probability to keep a row * \param max_row_perbatch maximum row per batch diff --git a/src/io/sparse_batch_page.h b/src/io/sparse_batch_page.h index 24546f785543..96810c0fb7be 100644 --- a/src/io/sparse_batch_page.h +++ b/src/io/sparse_batch_page.h @@ -33,8 +33,7 @@ class SparsePage { return offset.size() - 1; } /*! - * \brief load the by providing a list of interested segments - * only the interested segments are loaded + * \brief load only the segments we are interested in * \param fi the input stream of the file * \param sorted_index_set sorted index of segments we are interested in * \return true of the loading as successful, false if end of file was reached diff --git a/src/learner/dmatrix.h b/src/learner/dmatrix.h index 3fbc579de57e..52828c3be9eb 100644 --- a/src/learner/dmatrix.h +++ b/src/learner/dmatrix.h @@ -35,7 +35,7 @@ struct MetaInfo { std::vector weights; /*! 
* \brief initialized margins, - * if specified, xgboost will start from this init margin + * if specified, xgboost will start from this initial margin * can be used to specify initial prediction to boost from */ std::vector base_margin; diff --git a/src/learner/evaluation-inl.hpp b/src/learner/evaluation-inl.hpp index 2b69a43a839c..d28702728963 100644 --- a/src/learner/evaluation-inl.hpp +++ b/src/learner/evaluation-inl.hpp @@ -21,7 +21,7 @@ namespace xgboost { namespace learner { /*! - * \brief base class of elementwise evaluation + * \brief base class of element-wise evaluation * \tparam Derived the name of subclass */ template @@ -57,7 +57,7 @@ struct EvalEWiseBase : public IEvaluator { */ inline static float EvalRow(float label, float pred); /*! - * \brief to be overide by subclas, final trasnformation + * \brief to be overridden by subclass, final transformation * \param esum the sum statistics returned by EvalRow * \param wsum sum of weight */ @@ -109,7 +109,7 @@ struct EvalError : public EvalEWiseBase { } }; -/*! \brief loglikelihood of poission distribution */ +/*! \brief log-likelihood of Poission distribution */ struct EvalPoissionNegLogLik : public EvalEWiseBase { virtual const char *Name(void) const { return "poisson-nloglik"; @@ -174,7 +174,7 @@ struct EvalMClassBase : public IEvaluator { const float *pred, size_t nclass); /*! - * \brief to be overide by subclas, final trasnformation + * \brief to be overridden by subclass, final transformation * \param esum the sum statistics returned by EvalRow * \param wsum sum of weight */ @@ -367,7 +367,7 @@ struct EvalPrecisionRatio : public IEvaluator{ std::string name_; }; -/*! \brief Area under curve, for both classification and rank */ +/*! \brief Area Under Curve, for both classification and rank */ struct EvalAuc : public IEvaluator { virtual float Eval(const std::vector &preds, const MetaInfo &info, @@ -382,7 +382,7 @@ struct EvalAuc : public IEvaluator { utils::Check(gptr.back() == info.labels.size(), "EvalAuc: group structure must match number of prediction"); const bst_omp_uint ngroup = static_cast(gptr.size() - 1); - // sum statictis + // sum statistics double sum_auc = 0.0f; #pragma omp parallel reduction(+:sum_auc) { @@ -404,13 +404,16 @@ struct EvalAuc : public IEvaluator { // keep bucketing predictions in same bucket if (j != 0 && rec[j].first != rec[j - 1].first) { sum_pospair += buf_neg * (sum_npos + buf_pos *0.5); - sum_npos += buf_pos; sum_nneg += buf_neg; + sum_npos += buf_pos; + sum_nneg += buf_neg; buf_neg = buf_pos = 0.0f; } - buf_pos += ctr * wt; buf_neg += (1.0f - ctr) * wt; + buf_pos += ctr * wt; + buf_neg += (1.0f - ctr) * wt; } sum_pospair += buf_neg * (sum_npos + buf_pos *0.5); - sum_npos += buf_pos; sum_nneg += buf_neg; + sum_npos += buf_pos; + sum_nneg += buf_neg; // check weird conditions utils::Check(sum_npos > 0.0 && sum_nneg > 0.0, "AUC: the dataset only contains pos or neg samples"); @@ -443,7 +446,8 @@ struct EvalRankList : public IEvaluator { utils::Check(preds.size() == info.labels.size(), "label size predict size not match"); // quick consistency when group is not available - std::vector tgptr(2, 0); tgptr[1] = static_cast(preds.size()); + std::vector tgptr(2, 0); + tgptr[1] = static_cast(preds.size()); const std::vector &gptr = info.group_ptr.size() == 0 ? 
tgptr : info.group_ptr; utils::Assert(gptr.size() != 0, "must specify group when constructing rank file"); utils::Assert(gptr.back() == preds.size(), @@ -468,7 +472,7 @@ struct EvalRankList : public IEvaluator { float dat[2]; dat[0] = static_cast(sum_metric); dat[1] = static_cast(ngroup); - // approximately estimate auc using mean + // approximately estimate the metric using mean rabit::Allreduce(dat, 2); return dat[0] / dat[1]; } else { @@ -500,14 +504,14 @@ struct EvalRankList : public IEvaluator { bool minus_; }; -/*! \brief Precison at N, for both classification and rank */ +/*! \brief Precision at N, for both classification and rank */ struct EvalPrecision : public EvalRankList{ public: explicit EvalPrecision(const char *name) : EvalRankList(name) {} protected: virtual float EvalMetric(std::vector< std::pair > &rec) const { - // calculate Preicsion + // calculate Precision std::sort(rec.begin(), rec.end(), CmpFirst); unsigned nhit = 0; for (size_t j = 0; j < rec.size() && j < this->topn_; ++j) { @@ -517,7 +521,7 @@ struct EvalPrecision : public EvalRankList{ } }; -/*! \brief NDCG */ +/*! \brief NDCG: Normalized Discounted Cumulative Gain at N */ struct EvalNDCG : public EvalRankList{ public: explicit EvalNDCG(const char *name) : EvalRankList(name) {} @@ -549,7 +553,7 @@ struct EvalNDCG : public EvalRankList{ } }; -/*! \brief Precison at N, for both classification and rank */ +/*! \brief Mean Average Precision at N, for both classification and rank */ struct EvalMAP : public EvalRankList { public: explicit EvalMAP(const char *name) : EvalRankList(name) {} diff --git a/src/learner/helper_utils.h b/src/learner/helper_utils.h index 7ca7ba59c67c..0db1b46f3dd0 100644 --- a/src/learner/helper_utils.h +++ b/src/learner/helper_utils.h @@ -45,7 +45,7 @@ inline static int FindMaxIndex(const std::vector& rec) { return FindMaxIndex(BeginPtr(rec), rec.size()); } -// perform numerical safe logsum +// perform numerically safe logsum inline float LogSum(float x, float y) { if (x < y) { return y + std::log(std::exp(x - y) + 1.0f); @@ -53,7 +53,7 @@ inline float LogSum(float x, float y) { return x + std::log(std::exp(y - x) + 1.0f); } } -// numerical safe logsum +// numerically safe logsum inline float LogSum(const float *rec, size_t size) { float mx = rec[0]; for (size_t i = 1; i < size; ++i) { @@ -66,11 +66,11 @@ inline float LogSum(const float *rec, size_t size) { return mx + std::log(sum); } +// comparator functions for sorting pairs in descending order inline static bool CmpFirst(const std::pair &a, const std::pair &b) { return a.first > b.first; } - inline static bool CmpSecond(const std::pair &a, const std::pair &b) { return a.second > b.second; diff --git a/src/learner/learner-inl.hpp b/src/learner/learner-inl.hpp index f051992d3531..0e84806632bc 100644 --- a/src/learner/learner-inl.hpp +++ b/src/learner/learner-inl.hpp @@ -22,8 +22,8 @@ namespace xgboost { /*! \brief namespace for learning algorithm */ namespace learner { /*! - * \brief learner that takes do gradient boosting on specific objective functions - * and do training and prediction + * \brief learner that performs gradient boosting for a specific objective function. + * It does training and prediction. */ class BoostLearner : public rabit::Serializable { public: @@ -258,7 +258,7 @@ class BoostLearner : public rabit::Serializable { } /*! 
* \brief check if data matrix is ready to be used by training, - * if not intialize it + * if not initialize it * \param p_train pointer to the matrix used by training */ inline void CheckInit(DMatrix *p_train) { @@ -283,7 +283,7 @@ class BoostLearner : public rabit::Serializable { /*! * \brief update the model for one iteration * \param iter current iteration number - * \param p_train pointer to the data matrix + * \param train reference to the data matrix */ inline void UpdateOneIter(int iter, const DMatrix &train) { if (seed_per_iteration != 0 || rabit::IsDistributed()) { @@ -342,6 +342,7 @@ class BoostLearner : public rabit::Serializable { * \param out_preds output vector that stores the prediction * \param ntree_limit limit number of trees used for boosted tree * predictor, when it equals 0, this means we are using all the trees + * \param pred_leaf whether to only predict the leaf index of each tree in a boosted tree predictor */ inline void Predict(const DMatrix &data, bool output_margin, @@ -358,7 +359,7 @@ class BoostLearner : public rabit::Serializable { } } /*! - * \brief online prediction funciton, predict score for one instance at a time + * \brief online prediction function, predict score for one instance at a time * NOTE: use the batch prediction interface if possible, batch prediction is usually * more efficient than online prediction * This function is NOT threadsafe, make sure you only call from one thread @@ -367,7 +368,6 @@ class BoostLearner : public rabit::Serializable { * \param output_margin whether to only predict margin value instead of transformed prediction * \param out_preds output vector to hold the predictions * \param ntree_limit limit the number of trees used in prediction - * \param root_index the root index * \sa Predict */ inline void Predict(const SparseBatch::Inst &inst, @@ -452,7 +452,7 @@ class BoostLearner : public rabit::Serializable { float base_score; /* \brief number of features */ unsigned num_feature; - /* \brief number of class, if it is multi-class classification */ + /* \brief number of classes, if it is multi-class classification */ int num_class; /*! 
\brief whether the model itself is saved with pbuffer */ int saved_with_pbuffer; @@ -495,7 +495,7 @@ class BoostLearner : public rabit::Serializable { int updater_mode; // cached size of predict buffer size_t pred_buffer_size; - // maximum buffred row value + // maximum buffered row value float prob_buffer_row; // evaluation set EvalSet evaluator_; @@ -505,13 +505,13 @@ class BoostLearner : public rabit::Serializable { gbm::IGradBooster *gbm_; // name of gbm model used for training std::string name_gbm_; - // objective fnction + // objective function IObjFunction *obj_; // name of objective function std::string name_obj_; // configurations std::vector< std::pair > cfg_; - // temporal storages for prediciton + // temporal storages for prediction std::vector preds_; // gradient pairs std::vector gpair_; @@ -527,7 +527,7 @@ class BoostLearner : public rabit::Serializable { CacheEntry(const DMatrix *mat, size_t buffer_offset, size_t num_row) :mat_(mat), buffer_offset_(buffer_offset), num_row_(num_row) {} }; - // find internal bufer offset for certain matrix, if not exist, return -1 + // find internal buffer offset for certain matrix, if not exist, return -1 inline int64_t FindBufferOffset(const DMatrix &mat) const { for (size_t i = 0; i < cache_.size(); ++i) { if (cache_[i].mat_ == &mat && mat.cache_learner_ptr_ == this) { diff --git a/src/learner/objective-inl.hpp b/src/learner/objective-inl.hpp index b6d388e3c1e0..ce23b02fb91c 100644 --- a/src/learner/objective-inl.hpp +++ b/src/learner/objective-inl.hpp @@ -84,7 +84,7 @@ struct LossType { * \return second order gradient */ inline float SecondOrderGradient(float predt, float label) const { - // cap second order gradient to postive value + // cap second order gradient to positive value const float eps = 1e-16f; switch (loss_type) { case kLinearSquare: return 1.0f; diff --git a/src/learner/objective.h b/src/learner/objective.h index 08b57f528ce3..7742868544cb 100644 --- a/src/learner/objective.h +++ b/src/learner/objective.h @@ -68,7 +68,7 @@ class IObjFunction{ // factory function namespace xgboost { namespace learner { -/*! \brief factory funciton to create objective function by name */ +/*! \brief factory function to create objective function by name */ inline IObjFunction* CreateObjFunction(const char *name) { using namespace std; if (!strcmp("reg:linear", name)) return new RegLossObj(LossType::kLinearSquare); diff --git a/src/tree/model.h b/src/tree/model.h index 6a22aa5f19b4..6f2479cc228b 100644 --- a/src/tree/model.h +++ b/src/tree/model.h @@ -321,9 +321,9 @@ class TreeModel { */ inline void SaveModel(utils::IStream &fo) const { // NOLINT(*) utils::Assert(param.num_nodes == static_cast(nodes.size()), - "Tree::SaveModel"); + "TreeModel::SaveModel"); utils::Assert(param.num_nodes == static_cast(stats.size()), - "Tree::SaveModel"); + "TreeModel::SaveModel"); fo.Write(¶m, sizeof(Param)); utils::Assert(param.num_nodes != 0, "invalid model"); fo.Write(BeginPtr(nodes), sizeof(Node) * nodes.size()); @@ -462,7 +462,7 @@ class TreeModel { /*! \brief node statistics used in regression tree */ struct RTreeNodeStat { - /*! \brief loss chg caused by current split */ + /*! \brief loss change caused by current split */ float loss_chg; /*! \brief sum of hessian values, used to measure coverage of data */ float sum_hess; @@ -485,7 +485,7 @@ class RegTree: public TreeModel{ public: /*! 
* \brief dense feature vector that can be taken by RegTree - * to do tranverse efficiently + * to do traverse efficiently * and can be construct from sparse feature vector */ struct FVec { @@ -498,7 +498,7 @@ class RegTree: public TreeModel{ int flag; }; std::vector data; - /*! \brief intialize the vector with size vector */ + /*! \brief initialize the vector with size vector */ inline void Init(size_t size) { Entry e; e.flag = -1; data.resize(size); @@ -529,14 +529,14 @@ class RegTree: public TreeModel{ }; /*! * \brief get the leaf index - * \param feats dense feature vector, if the feature is missing the field is set to NaN - * \param root_gid starting root index of the instance + * \param feat dense feature vector, if the feature is missing the field is set to NaN + * \param root_id starting root index of the instance * \return the leaf index of the given feature */ - inline int GetLeafIndex(const FVec&feat, unsigned root_id = 0) const { + inline int GetLeafIndex(const FVec &feat, unsigned root_id = 0) const { // start from groups that belongs to current data int pid = static_cast(root_id); - // tranverse tree + // traverse tree while (!(*this)[ pid ].is_leaf()) { unsigned split_index = (*this)[pid].split_index(); pid = this->GetNext(pid, feat.fvalue(split_index), feat.is_missing(split_index)); @@ -546,7 +546,7 @@ class RegTree: public TreeModel{ /*! * \brief get the prediction of regression tree, only accepts dense feature vector * \param feats dense feature vector, if the feature is missing the field is set to NaN - * \param root_gid starting root index of the instance + * \param root_id starting root index of the instance * \return the leaf index of the given feature */ inline float Predict(const FVec &feat, unsigned root_id = 0) const { diff --git a/src/tree/param.h b/src/tree/param.h index f06365a17d9e..364e3572d297 100644 --- a/src/tree/param.h +++ b/src/tree/param.h @@ -32,7 +32,7 @@ struct TrainParam{ // default direction choice int default_direction; // maximum delta update we can add in weight estimation - // this parameter can be used to stablize update + // this parameter can be used to stabilize update // default=0 means no constraint on weight delta float max_delta_step; // whether we want to do subsample @@ -51,7 +51,7 @@ struct TrainParam{ int size_leaf_vector; // option for parallelization int parallel_option; - // option to open cacheline optimizaton + // option to open cacheline optimization int cache_opt; // number of threads to be used for tree construction, // if OpenMP is enabled, if equals 0, use system default @@ -72,7 +72,8 @@ struct TrainParam{ opt_dense_col = 1.0f; nthread = 0; size_leaf_vector = 0; - parallel_option = 2; + // enforce parallel option to 0 for now, investigate the other strategy + parallel_option = 0; sketch_eps = 0.1f; sketch_ratio = 2.0f; cache_opt = 1; @@ -131,7 +132,7 @@ struct TrainParam{ } } } - // calculate cost of loss function with four stati + // calculate cost of loss function with four statistics inline double CalcGain(double sum_grad, double sum_hess, double test_grad, double test_hess) const { double w = CalcWeight(sum_grad, sum_hess); @@ -166,7 +167,7 @@ struct TrainParam{ inline bool need_backward_search(float col_density, bool indicator) const { return this->default_direction != 2; } - /*! \brief given the loss change, whether we need to invode prunning */ + /*! 
\brief given the loss change, whether we need to invoke pruning */ inline bool need_prune(double loss_chg, int depth) const { return loss_chg < this->min_split_loss; } @@ -234,7 +235,7 @@ struct GradStats { const bst_gpair &b = gpair[ridx]; this->Add(b.grad, b.hess); } - /*! \brief caculate leaf weight */ + /*! \brief calculate leaf weight */ inline double CalcWeight(const TrainParam ¶m) const { return param.CalcWeight(sum_grad, sum_hess); } @@ -361,10 +362,10 @@ struct SplitEntry{ /*! \brief constructor */ SplitEntry(void) : loss_chg(0.0f), sindex(0), split_value(0.0f) {} /*! - * \brief decides whether a we can replace current entry with the statistics given - * This function gives better priority to lower index when loss_chg equals - * not the best way, but helps to give consistent result during multi-thread execution - * \param loss_chg the loss reduction get through the split + * \brief decides whether we can replace current entry with the given statistics + * This function gives better priority to lower index when loss_chg == new_loss_chg. + * Not the best way, but helps to give consistent result during multi-thread execution. + * \param new_loss_chg the loss reduction get through the split * \param split_index the feature index where the split is on */ inline bool NeedReplace(bst_float new_loss_chg, unsigned split_index) const { @@ -391,9 +392,9 @@ struct SplitEntry{ } /*! * \brief update the split entry, replace it if e is better - * \param loss_chg loss reduction of new candidate + * \param new_loss_chg loss reduction of new candidate * \param split_index feature index to split on - * \param split_value the split point + * \param new_split_value the split point * \param default_left whether the missing value goes to left * \return whether the proposed split is better and can replace current split */ diff --git a/src/tree/updater.h b/src/tree/updater.h index 1cf74a699bed..ff4da5e98d82 100644 --- a/src/tree/updater.h +++ b/src/tree/updater.h @@ -26,11 +26,11 @@ class IUpdater { */ virtual void SetParam(const char *name, const char *val) = 0; /*! - * \brief peform update to the tree models + * \brief perform update to the tree models * \param gpair the gradient pair statistics of the data * \param p_fmat feature matrix that provide access to features * \param info extra side information that may be need, such as root index - * \param trees pointer to the trees to be updated, upater will change the content of the tree + * \param trees references the trees to be updated, updater will change the content of trees * note: all the trees in the vector are updated, with the same statistics, * but maybe different random seeds, usually one tree is passed in at a time, * there can be multiple trees when we train random forest style model @@ -53,7 +53,7 @@ class IUpdater { virtual ~IUpdater(void) {} }; /*! - * \brief create a updater based on name + * \brief create an updater based on name * \param name name of updater * \return return the updater instance */ diff --git a/src/tree/updater_colmaker-inl.hpp b/src/tree/updater_colmaker-inl.hpp index e3070d495492..1f89f7ed49f3 100644 --- a/src/tree/updater_colmaker-inl.hpp +++ b/src/tree/updater_colmaker-inl.hpp @@ -17,7 +17,7 @@ namespace xgboost { namespace tree { -/*! \brief colunwise update to construct a tree */ +/*! 
\brief column-wise update to construct a tree */ template class ColMaker: public IUpdater { public: diff --git a/src/tree/updater_prune-inl.hpp b/src/tree/updater_prune-inl.hpp index dc99e94e4979..2b90646be416 100644 --- a/src/tree/updater_prune-inl.hpp +++ b/src/tree/updater_prune-inl.hpp @@ -14,7 +14,7 @@ namespace xgboost { namespace tree { -/*! \brief pruner that prunes a tree after growing finishs */ +/*! \brief pruner that prunes a tree after growing finishes */ class TreePruner: public IUpdater { public: virtual ~TreePruner(void) {} @@ -56,7 +56,7 @@ class TreePruner: public IUpdater { return npruned; } } - /*! \brief do prunning of a tree */ + /*! \brief do pruning of a tree */ inline void DoPrune(RegTree &tree) { // NOLINT(*) int npruned = 0; // initialize auxiliary statistics @@ -69,7 +69,7 @@ class TreePruner: public IUpdater { } } if (silent == 0) { - utils::Printf("tree prunning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", + utils::Printf("tree pruning end, %d roots, %d extra nodes, %d pruned nodes ,max_depth=%d\n", tree.param.num_roots, tree.num_extra_nodes(), npruned, tree.MaxDepth()); } } diff --git a/src/utils/base64-inl.h b/src/utils/base64-inl.h index 49cd652549fb..be99e07b7217 100644 --- a/src/utils/base64-inl.h +++ b/src/utils/base64-inl.h @@ -91,7 +91,7 @@ class Base64InStream: public IStream { * call this function before actually start read */ inline void InitPosition(void) { - // get a charater + // get a character do { tmp_ch = reader_.GetChar(); } while (isspace(tmp_ch)); @@ -223,7 +223,7 @@ class Base64OutStream: public IStream { } /*! * \brief finish writing of all current base64 stream, do some post processing - * \param endch charater to put to end of stream, if it is EOF, then nothing will be done + * \param endch character to put to end of stream, if it is EOF, then nothing will be done */ inline void Finish(char endch = EOF) { using base64::EncodeTable; diff --git a/src/utils/fmap.h b/src/utils/fmap.h index 218a61aa4045..cc06b7021168 100644 --- a/src/utils/fmap.h +++ b/src/utils/fmap.h @@ -58,7 +58,7 @@ class FeatMap { } /*! \brief return type of specific feature */ const Type& type(size_t idx) const { - utils::Assert(idx < names_.size(), "utils::FMap::name feature index exceed bound"); + utils::Assert(idx < names_.size(), "utils::FMap::type feature index exceed bound"); return types_[idx]; } diff --git a/src/utils/iterator.h b/src/utils/iterator.h index 5d986b2e40f1..73068dbbfb40 100644 --- a/src/utils/iterator.h +++ b/src/utils/iterator.h @@ -23,7 +23,7 @@ class IIterator { * \param val value of parameter */ virtual void SetParam(const char *name, const char *val) {} - /*! \brief initalize the iterator so that we can use the iterator */ + /*! \brief initialize the iterator so that we can use the iterator */ virtual void Init(void) {} /*! \brief set before first of the item */ virtual void BeforeFirst(void) = 0; diff --git a/src/utils/quantile.h b/src/utils/quantile.h index adcd0222de7d..d1c029f65d90 100644 --- a/src/utils/quantile.h +++ b/src/utils/quantile.h @@ -214,7 +214,7 @@ struct WQSummary { /*! * \brief set current summary to be merged summary of sa and sb * \param sa first input summary to be merged - * \param sb second input summar to be merged + * \param sb second input summary to be merged */ inline void SetCombine(const WQSummary &sa, const WQSummary &sb) { @@ -329,7 +329,7 @@ struct WQSummary { } }; -/*! \brief try to do efficient prunning */ +/*! 
\brief try to do efficient pruning */ template struct WXQSummary : public WQSummary { // redefine entry type @@ -364,7 +364,7 @@ struct WXQSummary : public WQSummary { RType mrange = 0; { // first scan, grab all the big chunk - // moviing block index + // moving block index size_t bid = 0; for (size_t i = 1; i < src.size; ++i) { if (CheckLarge(src.data[i], chunk)) { @@ -574,7 +574,7 @@ struct GKSummary { }; /*! - * \brief template for all quantle sketch algorithm + * \brief template for all quantile sketch algorithm * that uses merge/prune scheme * \tparam DType type of data content * \tparam RType type of rank @@ -605,7 +605,7 @@ class QuantileSketchTemplate { } /*! * \brief set the space to be merge of all Summary arrays - * \param begin begining position in th summary array + * \param begin beginning position in the summary array * \param end ending position in the Summary array */ inline void SetMerge(const Summary *begin, @@ -664,7 +664,7 @@ class QuantileSketchTemplate { } }; /*! - * \brief intialize the quantile sketch, given the performance specification + * \brief initialize the quantile sketch, given the performance specification * \param maxn maximum number of data points can be feed into sketch * \param eps accuracy level of summary */ @@ -688,7 +688,7 @@ class QuantileSketchTemplate { } /*! * \brief add an element to a sketch - * \param x the elemented added to the sketch + * \param x the element added to the sketch */ inline void Push(DType x, RType w = 1) { if (w == static_cast(0)) return; diff --git a/src/utils/random.h b/src/utils/random.h index 7d52c2ae79ce..8e3255cf3b97 100644 --- a/src/utils/random.h +++ b/src/utils/random.h @@ -27,7 +27,7 @@ inline void Seed(unsigned seed) { inline double Uniform(void) { return static_cast(rand()) / (static_cast(RAND_MAX)+1.0); // NOLINT(*) } -/*! \brief return a real numer uniform in (0,1) */ +/*! \brief return a real number uniform in (0,1) */ inline double NextDouble2(void) { return (static_cast(rand()) + 1.0) / (static_cast(RAND_MAX)+2.0); // NOLINT(*) } diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h index bc4fb9f5e0d9..8acb8ffd0af3 100644 --- a/src/utils/thread_buffer.h +++ b/src/utils/thread_buffer.h @@ -21,8 +21,8 @@ namespace utils { #if !defined(XGBOOST_STRICT_CXX98_) /*! * \brief buffered loading iterator that uses multithread - * this template method will assume the following paramters - * \tparam Elem elememt type to be buffered + * this template method will assume the following parameters + * \tparam Elem element type to be buffered * \tparam ElemFactory factory type to implement in order to use thread buffer */ template @@ -45,7 +45,7 @@ class ThreadBuffer { /*! * \brief initalize the buffered iterator * \param param a initialize parameter that will pass to factory, ignore it if not necessary - * \return false if the initlization can't be done, e.g. buffer file hasn't been created + * \return false if the initialization can't be done, e.g. buffer file hasn't been created */ inline bool Init(void) { if (!factory.Init()) return false; @@ -61,7 +61,7 @@ class ThreadBuffer { inline void BeforeFirst(void) { // wait till last loader end loading_end.Wait(); - // critcal zone + // critical zone current_buf = 1; factory.BeforeFirst(); // reset terminate limit diff --git a/src/utils/utils.h b/src/utils/utils.h index 7a8f18390d52..4d06d3c61e2e 100644 --- a/src/utils/utils.h +++ b/src/utils/utils.h @@ -62,7 +62,7 @@ const int kPrintBuffer = 1 << 12; #ifndef XGBOOST_CUSTOMIZE_MSG_ /*! 
- * \brief handling of Assert error, caused by in-apropriate input + * \brief handling of Assert error, caused by inappropriate input * \param msg error message */ inline void HandleAssertError(const char *msg) { @@ -70,7 +70,7 @@ inline void HandleAssertError(const char *msg) { exit(-1); } /*! - * \brief handling of Check error, caused by in-apropriate input + * \brief handling of Check error, caused by inappropriate input * \param msg error message */ inline void HandleCheckError(const char *msg) { @@ -157,7 +157,7 @@ inline std::FILE *FopenCheck(const char *fname, const char *flag) { return fp; } } // namespace utils -// easy utils that can be directly acessed in xgboost +// easy utils that can be directly accessed in xgboost /*! \brief get the beginning address of a vector */ template inline T *BeginPtr(std::vector &vec) { // NOLINT(*) diff --git a/subtree/rabit/.gitignore b/subtree/rabit/.gitignore index 504802743472..121caaafe661 100644 --- a/subtree/rabit/.gitignore +++ b/subtree/rabit/.gitignore @@ -34,3 +34,6 @@ *tmp* *.rabit *.mock +dmlc-core +recommonmark +recom diff --git a/subtree/rabit/.travis.yml b/subtree/rabit/.travis.yml new file mode 100644 index 000000000000..339f5c692e5c --- /dev/null +++ b/subtree/rabit/.travis.yml @@ -0,0 +1,51 @@ +# disable sudo to use container based build +sudo: false + +# Use Build Matrix to do lint and build seperately +env: + matrix: + - TASK=lint LINT_LANG=cpp + - TASK=lint LINT_LANG=python + - TASK=doc + - TASK=build CXX=g++ + - TASK=test CXX=g++ + +# dependent apt packages +addons: + apt: + packages: + - doxygen + - libopenmpi-dev + - wget + - git + - libcurl4-openssl-dev + - unzip + - python-numpy + +before_install: + - git clone https://github.com/dmlc/dmlc-core + - export TRAVIS=dmlc-core/scripts/travis/ + - source ${TRAVIS}/travis_setup_env.sh + +install: + - pip install cpplint pylint --user `whoami` + +script: scripts/travis_script.sh + + +before_cache: + - ${TRAVIS}/travis_before_cache.sh + + +cache: + directories: + - ${HOME}/.cache/usr + + +notifications: +# Emails are sent to the committer's git-configured email address by default, + email: + on_success: change + on_failure: always + + diff --git a/subtree/rabit/Makefile b/subtree/rabit/Makefile index e2a96eb43b4e..8c9d9f4033cc 100644 --- a/subtree/rabit/Makefile +++ b/subtree/rabit/Makefile @@ -3,8 +3,19 @@ export CXX = g++ endif export MPICXX = mpicxx export LDFLAGS= -Llib -lrt -export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -pedantic -export CFLAGS = -O3 -msse2 -fPIC $(WARNFLAGS) +export WARNFLAGS= -Wall -Wextra -Wno-unused-parameter -Wno-unknown-pragmas -std=c++0x +export CFLAGS = -O3 -msse2 $(WARNFLAGS) + +ifndef WITH_FPIC + WITH_FPIC = 1 +endif +ifeq ($(WITH_FPIC), 1) + CFLAGS += -fPIC +endif + +ifndef LINT_LANG + LINT_LANG="all" +endif # build path BPATH=. 
@@ -15,7 +26,9 @@ OBJ= $(BPATH)/allreduce_base.o $(BPATH)/allreduce_robust.o $(BPATH)/engine.o $(B SLIB= wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so wrapper/librabit_wrapper_mpi.so ALIB= lib/librabit.a lib/librabit_mpi.a lib/librabit_empty.a lib/librabit_mock.a lib/librabit_base.a HEADERS=src/*.h include/*.h include/rabit/*.h -.PHONY: clean all install mpi python +DMLC=dmlc-core + +.PHONY: clean all install mpi python lint doc doxygen all: lib/librabit.a lib/librabit_mock.a wrapper/librabit_wrapper.so wrapper/librabit_wrapper_mock.so lib/librabit_base.a mpi: lib/librabit_mpi.a wrapper/librabit_wrapper_mpi.so @@ -40,10 +53,10 @@ wrapper/librabit_wrapper.so: $(BPATH)/rabit_wrapper.o lib/librabit.a wrapper/librabit_wrapper_mock.so: $(BPATH)/rabit_wrapper.o lib/librabit_mock.a wrapper/librabit_wrapper_mpi.so: $(BPATH)/rabit_wrapper.o lib/librabit_mpi.a -$(OBJ) : +$(OBJ) : $(CXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) -$(MPIOBJ) : +$(MPIOBJ) : $(MPICXX) -c $(CFLAGS) -o $@ $(firstword $(filter %.cpp %.c %.cc, $^) ) $(ALIB): @@ -52,6 +65,12 @@ $(ALIB): $(SLIB) : $(CXX) $(CFLAGS) -shared -o $@ $(filter %.cpp %.o %.c %.cc %.a, $^) $(LDFLAGS) +lint: + $(DMLC)/scripts/lint.py rabit $(LINT_LANG) src include wrapper + +doc doxygen: + cd include; doxygen ../doc/Doxyfile; cd - + clean: - $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~ + $(RM) $(OBJ) $(MPIOBJ) $(ALIB) $(MPIALIB) $(SLIB) *~ src/*~ include/*~ include/*/*~ wrapper/*~ diff --git a/subtree/rabit/README.md b/subtree/rabit/README.md index 1bf5caee4910..9302a2199eca 100644 --- a/subtree/rabit/README.md +++ b/subtree/rabit/README.md @@ -1,6 +1,8 @@ ## rabit: Reliable Allreduce and Broadcast Interface +[![Build Status](https://travis-ci.org/dmlc/rabit.svg?branch=master)](https://travis-ci.org/dmlc/rabit) +[![Documentation Status](https://readthedocs.org/projects/rabit/badge/?version=latest)](http://rabit.readthedocs.org/) -rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs. +rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support ***portable*** , ***scalable*** and ***reliable*** distributed machine learning programs. 
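As a quick illustration of the Allreduce abstraction described in the paragraph above, here is a minimal sketch of what a rabit program looks like. It is not part of this patch; the complete, runnable versions (basic.cc, basic.py) live in the guide folder linked below.

```c++
#include <rabit.h>
// Minimal sketch: every process fills a local array, and after the Allreduce
// call every process holds the element-wise maximum across all processes.
int main(int argc, char *argv[]) {
  rabit::Init(argc, argv);
  int a[3];
  for (int i = 0; i < 3; ++i) {
    a[i] = rabit::GetRank() + i;  // each rank contributes different local values
  }
  rabit::Allreduce<rabit::op::Max>(a, 3);  // in-place reduction across all nodes
  rabit::Finalize();
  return 0;
}
```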
* [Tutorial](guide) * [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) diff --git a/subtree/rabit/doc/.gitignore b/subtree/rabit/doc/.gitignore index 9036e38b3e54..95f88be439bb 100644 --- a/subtree/rabit/doc/.gitignore +++ b/subtree/rabit/doc/.gitignore @@ -1,3 +1,5 @@ html latex *.sh +_* +doxygen diff --git a/subtree/rabit/doc/Doxyfile b/subtree/rabit/doc/Doxyfile index 694bc35d305b..2c9c64ea7fa7 100644 --- a/subtree/rabit/doc/Doxyfile +++ b/subtree/rabit/doc/Doxyfile @@ -8,7 +8,7 @@ PROJECT_NAME = "rabit" PROJECT_NUMBER = PROJECT_BRIEF = PROJECT_LOGO = -OUTPUT_DIRECTORY = ../doc +OUTPUT_DIRECTORY = ../doc/doxygen CREATE_SUBDIRS = NO OUTPUT_LANGUAGE = English BRIEF_MEMBER_DESC = YES @@ -101,8 +101,8 @@ FILE_PATTERNS = RECURSIVE = NO EXCLUDE = EXCLUDE_SYMLINKS = NO -EXCLUDE_PATTERNS = *-inl.hpp -EXCLUDE_SYMBOLS = +EXCLUDE_PATTERNS = *-inl.hpp +EXCLUDE_SYMBOLS = EXAMPLE_PATH = EXAMPLE_PATTERNS = EXAMPLE_RECURSIVE = NO @@ -216,7 +216,7 @@ MAN_LINKS = NO #--------------------------------------------------------------------------- # configuration options related to the XML output #--------------------------------------------------------------------------- -GENERATE_XML = NO +GENERATE_XML = YES XML_OUTPUT = xml XML_SCHEMA = XML_DTD = diff --git a/subtree/rabit/doc/Makefile b/subtree/rabit/doc/Makefile new file mode 100644 index 000000000000..40bba2a280db --- /dev/null +++ b/subtree/rabit/doc/Makefile @@ -0,0 +1,192 @@ +# Makefile for Sphinx documentation +# + +# You can set these variables from the command line. +SPHINXOPTS = +SPHINXBUILD = sphinx-build +PAPER = +BUILDDIR = _build + +# User-friendly check for sphinx-build +ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) +$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/) +endif + +# Internal variables. +PAPEROPT_a4 = -D latex_paper_size=a4 +PAPEROPT_letter = -D latex_paper_size=letter +ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . +# the i18n builder cannot share the environment and doctrees with the others +I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . 
+ +.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext + +help: + @echo "Please use \`make ' where is one of" + @echo " html to make standalone HTML files" + @echo " dirhtml to make HTML files named index.html in directories" + @echo " singlehtml to make a single large HTML file" + @echo " pickle to make pickle files" + @echo " json to make JSON files" + @echo " htmlhelp to make HTML files and a HTML help project" + @echo " qthelp to make HTML files and a qthelp project" + @echo " applehelp to make an Apple Help Book" + @echo " devhelp to make HTML files and a Devhelp project" + @echo " epub to make an epub" + @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter" + @echo " latexpdf to make LaTeX files and run them through pdflatex" + @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx" + @echo " text to make text files" + @echo " man to make manual pages" + @echo " texinfo to make Texinfo files" + @echo " info to make Texinfo files and run them through makeinfo" + @echo " gettext to make PO message catalogs" + @echo " changes to make an overview of all changed/added/deprecated items" + @echo " xml to make Docutils-native XML files" + @echo " pseudoxml to make pseudoxml-XML files for display purposes" + @echo " linkcheck to check all external links for integrity" + @echo " doctest to run all doctests embedded in the documentation (if enabled)" + @echo " coverage to run coverage check of the documentation (if enabled)" + +clean: + rm -rf $(BUILDDIR)/* + +html: + $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." + +dirhtml: + $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml." + +singlehtml: + $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml + @echo + @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml." + +pickle: + $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle + @echo + @echo "Build finished; now you can process the pickle files." + +json: + $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json + @echo + @echo "Build finished; now you can process the JSON files." + +htmlhelp: + $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp + @echo + @echo "Build finished; now you can run HTML Help Workshop with the" \ + ".hhp project file in $(BUILDDIR)/htmlhelp." + +qthelp: + $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp + @echo + @echo "Build finished; now you can run "qcollectiongenerator" with the" \ + ".qhcp project file in $(BUILDDIR)/qthelp, like this:" + @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp" + @echo "To view the help file:" + @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc" + +applehelp: + $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp + @echo + @echo "Build finished. The help book is in $(BUILDDIR)/applehelp." + @echo "N.B. You won't be able to view it unless you put it in" \ + "~/Library/Documentation/Help or install it in your application" \ + "bundle." + +devhelp: + $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp + @echo + @echo "Build finished." 
+ @echo "To view the help file:" + @echo "# mkdir -p $$HOME/.local/share/devhelp/rabit" + @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit" + @echo "# devhelp" + +epub: + $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub + @echo + @echo "Build finished. The epub file is in $(BUILDDIR)/epub." + +latex: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo + @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex." + @echo "Run \`make' in that directory to run these through (pdf)latex" \ + "(use \`make latexpdf' here to do that automatically)." + +latexpdf: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through pdflatex..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +latexpdfja: + $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex + @echo "Running LaTeX files through platex and dvipdfmx..." + $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja + @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex." + +text: + $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text + @echo + @echo "Build finished. The text files are in $(BUILDDIR)/text." + +man: + $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man + @echo + @echo "Build finished. The manual pages are in $(BUILDDIR)/man." + +texinfo: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo + @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo." + @echo "Run \`make' in that directory to run these through makeinfo" \ + "(use \`make info' here to do that automatically)." + +info: + $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo + @echo "Running Texinfo files through makeinfo..." + make -C $(BUILDDIR)/texinfo info + @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo." + +gettext: + $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale + @echo + @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale." + +changes: + $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes + @echo + @echo "The overview file is in $(BUILDDIR)/changes." + +linkcheck: + $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck + @echo + @echo "Link check complete; look for any errors in the above output " \ + "or in $(BUILDDIR)/linkcheck/output.txt." + +doctest: + $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest + @echo "Testing of doctests in the sources finished, look at the " \ + "results in $(BUILDDIR)/doctest/output.txt." + +coverage: + $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage + @echo "Testing of coverage in the sources finished, look at the " \ + "results in $(BUILDDIR)/coverage/python.txt." + +xml: + $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml + @echo + @echo "Build finished. The XML files are in $(BUILDDIR)/xml." + +pseudoxml: + $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml + @echo + @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml." diff --git a/subtree/rabit/doc/conf.py b/subtree/rabit/doc/conf.py new file mode 100644 index 000000000000..ef89de48998c --- /dev/null +++ b/subtree/rabit/doc/conf.py @@ -0,0 +1,184 @@ +# -*- coding: utf-8 -*- +# +# documentation build configuration file, created by +# sphinx-quickstart on Thu Jul 23 19:40:08 2015. +# +# This file is execfile()d with the current directory set to its +# containing dir. 
+# +# Note that not all possible configuration values are present in this +# autogenerated file. +# +# All configuration values have a default; values that are commented out +# serve to show the default. +import sys +import os, subprocess +import shlex +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) +libpath = os.path.join(curr_path, '../wrapper/') +sys.path.insert(0, os.path.join(curr_path, '../wrapper/')) +sys.path.insert(0, curr_path) +from sphinx_util import MarkdownParser, AutoStructify + +# -- General configuration ------------------------------------------------ + +# General information about the project. +project = u'rabit' +copyright = u'2015, rabit developers' +author = u'rabit developers' +github_doc_root = 'https://github.com/dmlc/rabit/tree/master/doc/' + +# add markdown parser +MarkdownParser.github_doc_root = github_doc_root +source_parsers = { + '.md': MarkdownParser, +} +# Version information. +import rabit + +version = rabit.__version__ +release = rabit.__version__ + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.napoleon', + 'sphinx.ext.mathjax', + 'breathe', +] + +# Use breathe to include doxygen documents +breathe_projects = {'rabit' : 'doxygen/xml/'} +breathe_default_project = 'rabit' + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# source_suffix = ['.rst', '.md'] +source_suffix = ['.rst', '.md'] + +# The encoding of source files. +#source_encoding = 'utf-8-sig' + +# The master toctree document. +master_doc = 'index' + +# The language for content autogenerated by Sphinx. Refer to documentation +# for a list of supported languages. +# +# This is also used if you do content translation via gettext catalogs. +# Usually you set "language" from the command line for these cases. +language = None + +# There are two options for replacing |today|: either, you set today to some +# non-false value, then it is used: +#today = '' +# Else, today_fmt is used as the format for a strftime call. +#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = ['_build'] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. 
+#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# html_theme = 'alabaster' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Output file base name for HTML help builder. +htmlhelp_basename = project + 'doc' + +# -- Options for LaTeX output --------------------------------------------- +latex_elements = { +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'rabit.tex', project, + author, 'manual'), +] + +# hook for doxygen +def run_doxygen(folder): + """Run the doxygen make command in the designated folder.""" + try: + retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True) + if retcode < 0: + sys.stderr.write("doxygen terminated by signal %s" % (-retcode)) + except OSError as e: + sys.stderr.write("doxygen execution failed: %s" % e) + + +def run_build_lib(folder): + """Run the doxygen make command in the designated folder.""" + try: + retcode = subprocess.call("cd %s; make" % folder, shell=True) + retcode = subprocess.call("rm -rf _build/html/doxygen", shell=True) + retcode = subprocess.call("mkdir _build", shell=True) + retcode = subprocess.call("mkdir _build/html", shell=True) + retcode = subprocess.call("cp -rf doxygen/html _build/html/doxygen", shell=True) + if retcode < 0: + sys.stderr.write("build terminated by signal %s" % (-retcode)) + except OSError as e: + sys.stderr.write("build execution failed: %s" % e) + + +def generate_doxygen_xml(app): + """Run the doxygen make commands if we're on the ReadTheDocs server""" + read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True' + if read_the_docs_build: + run_doxygen('..') + sys.stderr.write('Check if shared lib exists\n') + run_build_lib('..') + sys.stderr.write('The wrapper path: %s\n' % str(os.listdir('../wrapper'))) + rabit._loadlib() + + +def setup(app): + # Add hook for building doxygen xml when needed + app.connect("builder-inited", generate_doxygen_xml) + app.add_config_value('recommonmark_config', { + 'url_resolver': lambda url: github_doc_root + url, + }, True) + app.add_transform(AutoStructify) diff --git a/subtree/rabit/doc/cpp_api.md b/subtree/rabit/doc/cpp_api.md new file mode 100644 index 000000000000..c6184aa0850c --- /dev/null +++ b/subtree/rabit/doc/cpp_api.md @@ -0,0 +1,9 @@ +C++ Library API of Rabit +======================== +This page contains document of Library API of rabit. + +```eval_rst +.. toctree:: + +.. doxygennamespace:: rabit +``` diff --git a/subtree/rabit/guide/README.md b/subtree/rabit/doc/guide.md similarity index 89% rename from subtree/rabit/guide/README.md rename to subtree/rabit/doc/guide.md index 26cace131b80..e2bfa5ce86b0 100644 --- a/subtree/rabit/guide/README.md +++ b/subtree/rabit/doc/guide.md @@ -1,10 +1,9 @@ Tutorial -===== +======== This is rabit's tutorial, a ***Reliable Allreduce and Broadcast Interface***. +All the example codes are in the [guide](https://github.com/dmlc/rabit/blob/master/guide/) folder of the project. 
To run the examples locally, you will need to build them with ```make```. -Please also refer to the [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for further details. - **List of Topics** * [What is Allreduce](#what-is-allreduce) * [Common Use Case](#common-use-case) @@ -20,9 +19,9 @@ Please also refer to the [API Documentation](http://homes.cs.washington.edu/~tqc * [Fault Tolerance](#fault-tolerance) What is Allreduce -===== +----------------- The main methods provided by rabit are Allreduce and Broadcast. Allreduce performs reduction across different computation nodes, -and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](basic.cc) (there is a python example right after this if you are more familiar with python). +and returns the result to every node. To understand the behavior of the function, consider the following example in [basic.cc](../guide/basic.cc) (there is a python example right after this if you are more familiar with python). ```c++ #include using namespace rabit; @@ -32,7 +31,7 @@ int main(int argc, char *argv[]) { rabit::Init(argc, argv); for (int i = 0; i < N; ++i) { a[i] = rabit::GetRank() + i; - } + } printf("@node[%d] before-allreduce: a={%d, %d, %d}\n", rabit::GetRank(), a[0], a[1], a[2]); // allreduce take max of each elements in all processes @@ -42,7 +41,7 @@ int main(int argc, char *argv[]) { // second allreduce that sums everything up Allreduce(&a[0], N); printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n", - rabit::GetRank(), a[0], a[1], a[2]); + rabit::GetRank(), a[0], a[1], a[2]); rabit::Finalize(); return 0; } @@ -55,7 +54,7 @@ starts the rabit program with two worker processes. This will start two processes, one process with rank 0 and the other with rank 1, both processes run the same code. The ```rabit::GetRank()``` function returns the rank of current process. -Before the call to Allreduce, process 0 contains the array ```a = {0, 1, 2}```, while process 1 has the array +Before the call to Allreduce, process 0 contains the array ```a = {0, 1, 2}```, while process 1 has the array ```a = {1, 2, 3}```. After the call to Allreduce, the array contents in all processes are replaced by the reduction result (in this case, the maximum value in each position across all the processes). So, after the Allreduce call, the result will become ```a = {1, 2, 3}```. @@ -63,7 +62,7 @@ Rabit provides different reduction operators, for example, if you change ```op: the reduction operation will be a summation, and the result will become ```a = {1, 3, 5}```. You can also run the example with different processes by setting -n to different values. -If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](basic.py): +If you are more familiar with python, you can also use rabit in python. The same example as before can be found in [basic.py](../guide/basic.py): ```python import numpy as np @@ -75,7 +74,7 @@ rank = rabit.get_rank() a = np.zeros(n) for i in xrange(n): a[i] = rank + i - + print '@node[%d] before-allreduce: a=%s' % (rank, str(a)) a = rabit.allreduce(a, rabit.MAX) print '@node[%d] after-allreduce-max: a=%s' % (rank, str(a)) @@ -89,7 +88,7 @@ You can run the program using the following command ``` Broadcast is another method provided by rabit besides Allreduce. This function allows one node to broadcast its -local data to all other nodes. 
The following code in [broadcast.cc](broadcast.cc) broadcasts a string from +local data to all other nodes. The following code in [broadcast.cc](../guide/broadcast.cc) broadcasts a string from node 0 to all other nodes. ```c++ #include @@ -115,7 +114,7 @@ The following command starts the program with three worker processes. ``` Besides strings, rabit also allows to broadcast constant size array and vectors. -The counterpart in python can be found in [broadcast.py](broadcast.py). Here is a snippet so that you can get a better sense of how simple is to use the python library: +The counterpart in python can be found in [broadcast.py](../guide/broadcast.py). Here is a snippet so that you can get a better sense of how simple is to use the python library: ```python import rabit @@ -132,7 +131,7 @@ rabit.finalize() ``` Common Use Case -===== +--------------- Many distributed machine learning algorithms involve splitting the data into different nodes, computing statistics locally, and finally aggregating them. Such workflow is usually done repetitively through many iterations before the algorithm converges. Allreduce naturally meets the structure of such programs, common use cases include: @@ -144,7 +143,7 @@ common use cases include: Rabit is a reliable and portable library for distributed machine learning programs, that allow programs to run reliably on different platforms. Use Rabit API -==== +------------- This section introduces topics about how to use rabit API. You can always refer to [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) for definition of each functions. This section trys to gives examples of different aspectes of rabit API. @@ -178,16 +177,16 @@ int main(int argc, char *argv[]) { ``` Besides the common Allreduce and Broadcast functions, there are two additional functions: ```LoadCheckPoint``` -and ```CheckPoint```. These two functions are used for fault-tolerance purposes. +and ```CheckPoint```. These two functions are used for fault-tolerance purposes. As mentioned before, traditional machine learning programs involve several iterations. In each iteration, we start with a model, make some calls to Allreduce or Broadcast and update the model. The calling sequence in each iteration does not need to be the same. * When the nodes start from the beginning (i.e. iteration 0), ```LoadCheckPoint``` returns 0, so we can initialize the model. * ```CheckPoint``` saves the model after each iteration. - Efficiency Note: the model is only kept in local memory and no save to disk is performed when calling Checkpoint -* When a node goes down and restarts, ```LoadCheckPoint``` will recover the latest saved model, and -* When a node goes down, the rest of the nodes will block in the call of Allreduce/Broadcast and wait for - the recovery of the failed node until it catches up. +* When a node goes down and restarts, ```LoadCheckPoint``` will recover the latest saved model, and +* When a node goes down, the rest of the nodes will block in the call of Allreduce/Broadcast and wait for + the recovery of the failed node until it catches up. Please see the [Fault Tolerance](#fault-tolerance) section to understand the recovery procedure executed by rabit. @@ -202,8 +201,8 @@ into the data buffer, pass the data to Allreduce function, and get the reduced r from failure, we can directly recover the result from other nodes(see also [Fault Tolerance](#fault-tolerance)) and the data preparation procedure no longer necessary. 
Rabit Allreduce add an optional parameter preparation function to support such scenario. User can pass in a function that corresponds to the data preparation procedure to Allreduce -calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](lazy_allreduce.cc) -as an example to demonstrate this feature. It is modified from [basic.cc](basic.cc), and you can compare the two codes. +calls, and the data preparation function will only be called when necessary. We use [lazy_allreduce.cc](../guide/lazy_allreduce.cc) +as an example to demonstrate this feature. It is modified from [basic.cc](../guide/basic.cc), and you can compare the two codes. ```c++ #include using namespace rabit; @@ -216,18 +215,18 @@ int main(int argc, char *argv[]) { printf("@node[%d] run prepare function\n", rabit::GetRank()); for (int i = 0; i < N; ++i) { a[i] = rabit::GetRank() + i; - } + } }; printf("@node[%d] before-allreduce: a={%d, %d, %d}\n", rabit::GetRank(), a[0], a[1], a[2]); // allreduce take max of each elements in all processes - Allreduce(&a[0], N, prepare); + Allreduce(&a[0], N, prepare); printf("@node[%d] after-allreduce-sum: a={%d, %d, %d}\n", - rabit::GetRank(), a[0], a[1], a[2]); + rabit::GetRank(), a[0], a[1], a[2]); // rum second allreduce Allreduce(&a[0], N); printf("@node[%d] after-allreduce-max: a={%d, %d, %d}\n", - rabit::GetRank(), a[0], a[1], a[2]); + rabit::GetRank(), a[0], a[1], a[2]); rabit::Finalize(); return 0; } @@ -242,7 +241,7 @@ the effect when a process goes down. You can run the program using the following The additional arguments ```mock=0,0,1,0``` will cause node 0 to kill itself before second call of Allreduce (see also [mock test](#link-against-mock-test-rabit-library)). You will find that the prepare function's print is only executed once and node 0 will no longer execute the preparation function when it restarts from failure. -You can also find python version of the example in [lazy_allreduce.py](lazy_allreduce.py), and run it using the followin command +You can also find python version of the example in [lazy_allreduce.py](../guide/lazy_allreduce.py), and run it using the followin command ```bash ../tracker/rabit_demo.py -n 2 lazy_allreduce.py mock=0,0,1,0 @@ -250,8 +249,8 @@ You can also find python version of the example in [lazy_allreduce.py](lazy_allr Since lazy preparation function may not be called during execution. User should be careful when using this feature. For example, a possible mistake could be putting some memory allocation code in the lazy preparation function, and the computing memory was not allocated when lazy preparation function is not called. -The example in [lazy_allreduce.cc](lazy_allreduce.cc) provides a simple way to migrate normal prepration code([basic.cc](basic.cc)) to lazy version: wrap the preparation -code with a lambda function, and pass it to allreduce. +The example in [lazy_allreduce.cc](../guide/lazy_allreduce.cc) provides a simple way to migrate normal prepration code([basic.cc](../guide/basic.cc)) to lazy version: wrap the preparation +code with a lambda function, and pass it to allreduce. #### Checkpoint and LazyCheckpoint Common machine learning algorithms usually involves iterative computation. 
As mentioned in the section ([Structure of a Rabit Program](#structure-of-a-rabit-program)), @@ -263,9 +262,9 @@ There are two model arguments you can pass to Checkpoint and LoadCheckpoint: ``` * ```local_model``` refers to the model that is specifically tied to the current node - For example, in topic modeling, the topic assignments of subset of documents in current node is local model -Because the different nature of the two types of models, different strategy will be used for them. +Because the different nature of the two types of models, different strategy will be used for them. ```global_model``` is simply saved in local memory of each node, while ```local_model``` will replicated to some other -nodes (selected using a ring replication strategy). The checkpoint is only saved in the memory without touching the disk which makes rabit programs more efficient. +nodes (selected using a ring replication strategy). The checkpoint is only saved in the memory without touching the disk which makes rabit programs more efficient. User is encouraged to use ```global_model``` only when is sufficient for better efficiency. To enable a model class to be checked pointed, user can implement a [serialization interface](../include/rabit_serialization.h). The serialization interface already @@ -287,7 +286,7 @@ improve the efficiency of the program. Compile Programs with Rabit -==== +--------------------------- Rabit is a portable library, to use it, you only need to include the rabit header file. * You will need to add the path to [../include](../include) to the header search path of the compiler - Solution 1: add ```-I/path/to/rabit/include``` to the compiler flag in gcc or clang @@ -333,27 +332,27 @@ For example, consider the following script in the test case - Note that ndeath = 1 means this will happen only if node 1 died once, which is our case Running Rabit Jobs -==== -Rabit is a portable library that can run on multiple platforms. +------------------ +Rabit is a portable library that can run on multiple platforms. #### Running Rabit Locally -* You can use [../tracker/rabit_demo.py](../tracker/rabit_demo.py) to start n processes locally +* You can use [../tracker/rabit_demo.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_demo.py) to start n processes locally * This script will restart the program when it exits with -2, so it can be used for [mock test](#link-against-mock-test-library) #### Running Rabit on Hadoop -* You can use [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) to run rabit programs as Yarn application +* You can use [../tracker/rabit_yarn.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_yarn.py) to run rabit programs as Yarn application * This will start rabit programs as yarn applications - This allows multi-threading programs in each node, which can be more efficient - An easy multi-threading solution could be to use OpenMP with rabit code * It is also possible to run rabit program via hadoop streaming, however, YARN is highly recommended. #### Running Rabit using MPI -* You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](../tracker/rabit_mpi.py). +* You can submit rabit programs to an MPI cluster using [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py). * If you linked your code against librabit_mpi.a, then you can directly use mpirun to submit the job #### Customize Tracker Script You can also modify the tracker script to allow rabit to run on other platforms. 
To do so, refer to existing -tracker scripts, such as [../tracker/rabit_hadoop.py](../tracker/rabit_hadoop.py) and [../tracker/rabit_mpi.py](../tracker/rabit_mpi.py) to get a sense of how it is done. +tracker scripts, such as [../tracker/rabit_yarn.py](../tracker/rabit_yarn.py) and [../tracker/rabit_mpi.py](https://github.com/dmlc/rabit/blob/master/tracker/rabit_mpi.py) to get a sense of how it is done. You will need to implement a platform dependent submission function with the following definition ```python @@ -376,7 +375,7 @@ Note that the current rabit tracker does not restart a worker when it dies, the - rabit-yarn provides such functionality in YARN Fault Tolerance -===== +--------------- This section introduces how fault tolerance works in rabit. The following figure shows how rabit deals with failures. diff --git a/subtree/rabit/doc/index.md b/subtree/rabit/doc/index.md new file mode 100644 index 000000000000..d209d95ba074 --- /dev/null +++ b/subtree/rabit/doc/index.md @@ -0,0 +1,24 @@ +Rabit Documentation +===================== +rabit is a light weight library that provides a fault tolerant interface of Allreduce and Broadcast. It is designed to support easy implementations of distributed machine learning programs, many of which fall naturally under the Allreduce abstraction. The goal of rabit is to support **portable** , **scalable** and **reliable** distributed machine learning programs. + +API Documents +------------- +```eval_rst + +.. toctree:: + :maxdepth: 2 + + python_api.md + cpp_api.md + parameters.md + guide.md +``` +Indices and tables +------------------ + +```eval_rst +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` +``` \ No newline at end of file diff --git a/subtree/rabit/doc/mkdoc.sh b/subtree/rabit/doc/mkdoc.sh deleted file mode 100755 index 181e280fb38b..000000000000 --- a/subtree/rabit/doc/mkdoc.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash -cd ../include -doxygen ../doc/Doxyfile -cd ../doc diff --git a/subtree/rabit/doc/README.md b/subtree/rabit/doc/parameters.md similarity index 70% rename from subtree/rabit/doc/README.md rename to subtree/rabit/doc/parameters.md index fadc9a1b1b92..37580d5a13a9 100644 --- a/subtree/rabit/doc/README.md +++ b/subtree/rabit/doc/parameters.md @@ -1,18 +1,11 @@ -Rabit Documentation -==== -* [Tutorial](../guide) -* [API Documentation](http://homes.cs.washington.edu/~tqchen/rabit/doc) - - You can also run ```./mkdoc.sh``` to make the document locally -* [Parameters](#parameters) - Parameters -==== +========== This section list all the parameters that can be passed to rabit::Init function as argv. -All the parameters are passed in as string in format of ```parameter-name=parameter-value```. +All the parameters are passed in as string in format of ``parameter-name=parameter-value``. In most setting these parameters have default value or will be automatically detected, and do not need to be manually configured. 
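For reference, these ``parameter-name=parameter-value`` strings reach the program simply through the argument vector handed to ```rabit::Init```. The sketch below is illustrative only and not part of this patch; in practice the tracker supplies these arguments automatically, and ```rabit::Init``` picks up the settings it recognizes.

```c++
#include <rabit.h>
// Parameters such as rabit_tracker_uri arrive as "name=value" strings in argv
// and are forwarded to rabit::Init, which parses the settings it recognizes.
int main(int argc, char *argv[]) {
  // e.g. launched by the tracker as:
  //   ./my_program rabit_tracker_uri=192.0.2.1 rabit_tracker_port=9091
  rabit::Init(argc, argv);
  // ... Allreduce / Broadcast / CheckPoint calls go here ...
  rabit::Finalize();
  return 0;
}
```

The individual parameters are listed below.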
-* rabit_tracker_uri [passed in automatically by tracker] +* rabit_tracker_uri [passed in automatically by tracker] - The uri/ip of rabit tracker * rabit_tracker_port [passed in automatically by tracker] - The port of rabit tracker diff --git a/subtree/rabit/doc/python-requirements.txt b/subtree/rabit/doc/python-requirements.txt new file mode 100644 index 000000000000..5970c4367983 --- /dev/null +++ b/subtree/rabit/doc/python-requirements.txt @@ -0,0 +1,4 @@ +numpy +breathe +commonmark + diff --git a/subtree/rabit/doc/python_api.md b/subtree/rabit/doc/python_api.md new file mode 100644 index 000000000000..8a0eda9215b6 --- /dev/null +++ b/subtree/rabit/doc/python_api.md @@ -0,0 +1,11 @@ +Python API of Rabit +=================== +This page contains document of python API of rabit. + +```eval_rst +.. toctree:: + +.. automodule:: rabit + :members: + :show-inheritance: +``` diff --git a/subtree/rabit/doc/sphinx_util.py b/subtree/rabit/doc/sphinx_util.py new file mode 100644 index 000000000000..f6a33ffa375d --- /dev/null +++ b/subtree/rabit/doc/sphinx_util.py @@ -0,0 +1,16 @@ +# -*- coding: utf-8 -*- +"""Helper utilty function for customization.""" +import sys +import os +import docutils +import subprocess + +if os.environ.get('READTHEDOCS', None) == 'True': + subprocess.call('cd ..; rm -rf recommonmark;' + + 'git clone https://github.com/tqchen/recommonmark', shell=True) + +sys.path.insert(0, os.path.abspath('../recommonmark/')) +from recommonmark import parser, transform + +MarkdownParser = parser.CommonMarkParser +AutoStructify = transform.AutoStructify diff --git a/subtree/rabit/guide/README b/subtree/rabit/guide/README new file mode 100644 index 000000000000..2483d683fd70 --- /dev/null +++ b/subtree/rabit/guide/README @@ -0,0 +1 @@ +See tutorial at ../doc/guide.md \ No newline at end of file diff --git a/subtree/rabit/guide/basic.cc b/subtree/rabit/guide/basic.cc index 62c0fc16524b..a9a729170c51 100644 --- a/subtree/rabit/guide/basic.cc +++ b/subtree/rabit/guide/basic.cc @@ -5,11 +5,17 @@ * * \author Tianqi Chen */ +#define _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_DEPRECATE +#include #include using namespace rabit; -const int N = 3; int main(int argc, char *argv[]) { - int a[N]; + int N = 3; + if (argc > 1) { + N = atoi(argv[1]); + } + std::vector a(N); rabit::Init(argc, argv); for (int i = 0; i < N; ++i) { a[i] = rabit::GetRank() + i; diff --git a/subtree/rabit/include/dmlc/io.h b/subtree/rabit/include/dmlc/io.h index e273763ca26d..66d590b2d160 100644 --- a/subtree/rabit/include/dmlc/io.h +++ b/subtree/rabit/include/dmlc/io.h @@ -14,6 +14,7 @@ // include uint64_t only to make io standalone #ifdef _MSC_VER +/*! \brief uint64 */ typedef unsigned __int64 uint64_t; #else #include @@ -24,7 +25,7 @@ namespace dmlc { /*! * \brief interface of stream I/O for serialization */ -class Stream { +class Stream { // NOLINT(*) public: /*! * \brief reads data from a stream @@ -71,7 +72,7 @@ class Stream { /*! * \brief writes a string * \param str the string to be written/serialized - */ + */ inline void Write(const std::string &str); /*! 
* \brief loads a string @@ -94,7 +95,7 @@ class SeekStream: public Stream { * \brief generic factory function * create an SeekStream for read only, * the stream will close the underlying files upon deletion - * error will be reported and the system will exit when create failed + * error will be reported and the system will exit when create failed * \param uri the uri of the input currently we support * hdfs://, s3://, and file:// by default file:// will be used * \param allow_null whether NULL can be returned, or directly report error @@ -107,12 +108,12 @@ class SeekStream: public Stream { /*! \brief interface for serializable objects */ class Serializable { public: - /*! + /*! * \brief load the model from a stream * \param fi stream where to load the model from */ virtual void Load(Stream *fi) = 0; - /*! + /*! * \brief saves the model to a stream * \param fo stream where to save the model to */ @@ -123,7 +124,7 @@ class Serializable { * \brief input split creates that allows reading * of records from split of data, * independent part that covers all the dataset - * + * * see InputSplit::Create for definition of record */ class InputSplit { @@ -141,7 +142,7 @@ class InputSplit { * this is a hint so may not be enforced, * but InputSplit will try adjust its internal buffer * size to the hinted value - * \param chunk_size the chunk size + * \param chunk_size the chunk size */ virtual void HintChunkSize(size_t chunk_size) {} /*! \brief reset the position of InputSplit to beginning */ @@ -150,7 +151,7 @@ class InputSplit { * \brief get the next record, the returning value * is valid until next call to NextRecord or NextChunk * caller can modify the memory content of out_rec - * + * * For text, out_rec contains a single line * For recordio, out_rec contains one record content(with header striped) * @@ -161,11 +162,11 @@ class InputSplit { */ virtual bool NextRecord(Blob *out_rec) = 0; /*! - * \brief get a chunk of memory that can contain multiple records, + * \brief get a chunk of memory that can contain multiple records, * the caller needs to parse the content of the resulting chunk, * for text file, out_chunk can contain data of multiple lines * for recordio, out_chunk can contain multiple records(including headers) - * + * * This function ensures there won't be partial record in the chunk * caller can modify the memory content of out_chunk, * the memory is valid until next call to NextRecord or NextChunk @@ -192,9 +193,10 @@ class InputSplit { * List of possible types: "text", "recordio" * - "text": * text file, each line is treated as a record - * input split will split on \n or \r + * input split will split on '\\n' or '\\r' * - "recordio": * binary recordio file, see recordio.h + * \return a new input split * \sa InputSplit::Type */ static InputSplit* Create(const char *uri, @@ -224,7 +226,7 @@ class ostream : public std::basic_ostream { * \param buffer_size internal streambuf size */ explicit ostream(Stream *stream, - size_t buffer_size = 1 << 10) + size_t buffer_size = (1 << 10)) : std::basic_ostream(NULL), buf_(buffer_size) { this->set_stream(stream); } @@ -240,7 +242,7 @@ class ostream : public std::basic_ostream { buf_.set_stream(stream); this->rdbuf(&buf_); } - + private: // internal streambuf class OutBuf : public std::streambuf { @@ -251,7 +253,7 @@ class ostream : public std::basic_ostream { } // set stream to the buffer inline void set_stream(Stream *stream); - + private: /*! 
\brief internal stream by StreamBuf */ Stream *stream_; @@ -287,7 +289,7 @@ class istream : public std::basic_istream { * \param buffer_size internal buffer size */ explicit istream(Stream *stream, - size_t buffer_size = 1 << 10) + size_t buffer_size = (1 << 10)) : std::basic_istream(NULL), buf_(buffer_size) { this->set_stream(stream); } @@ -325,7 +327,7 @@ class istream : public std::basic_istream { Stream *stream_; /*! \brief how many bytes we read so far */ size_t bytes_read_; - /*! \brief internal buffer */ + /*! \brief internal buffer */ std::vector buffer_; // override underflow inline int_type underflow(); @@ -402,7 +404,7 @@ inline int ostream::OutBuf::overflow(int c) { // implementations for istream inline void istream::InBuf::set_stream(Stream *stream) { stream_ = stream; - this->setg(&buffer_[0], &buffer_[0], &buffer_[0]); + this->setg(&buffer_[0], &buffer_[0], &buffer_[0]); } inline int istream::InBuf::underflow() { char *bhead = &buffer_[0]; diff --git a/subtree/rabit/include/rabit.h b/subtree/rabit/include/rabit.h index 824b454bb814..b0f1df39c5ec 100644 --- a/subtree/rabit/include/rabit.h +++ b/subtree/rabit/include/rabit.h @@ -8,12 +8,18 @@ * rabit.h and serializable.h is all what the user needs to use the rabit interface * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou */ -#ifndef RABIT_RABIT_H_ -#define RABIT_RABIT_H_ +#ifndef RABIT_RABIT_H_ // NOLINT(*) +#define RABIT_RABIT_H_ // NOLINT(*) #include #include + +// whether or not use c++11 support +#ifndef DMLC_USE_CXX11 +#define DMLC_USE_CXX11 (defined(__GXX_EXPERIMENTAL_CXX0X__) ||\ + __cplusplus >= 201103L || defined(_MSC_VER)) +#endif // optionally support of lambda functions in C++11, if available -#if __cplusplus >= 201103L +#if DMLC_USE_CXX11 #include #endif // C++11 // contains definition of Serializable @@ -56,8 +62,8 @@ struct BitOR; * \param argv the array of input arguments */ inline void Init(int argc, char *argv[]); -/*! - * \brief finalizes the rabit engine, call this function after you finished with all the jobs +/*! + * \brief finalizes the rabit engine, call this function after you finished with all the jobs */ inline void Finalize(void); /*! \brief gets rank of the current process */ @@ -71,7 +77,7 @@ inline bool IsDistributed(void); inline std::string GetProcessorName(void); /*! * \brief prints the msg to the tracker, - * this function can be used to communicate progress information to + * this function can be used to communicate progress information to * the user who monitors the tracker * \param msg the message to be printed */ @@ -89,7 +95,7 @@ inline void TrackerPrintf(const char *fmt, ...); /*! * \brief broadcasts a memory region to every node from the root * - * Example: int a = 1; Broadcast(&a, sizeof(a), root); + * Example: int a = 1; Broadcast(&a, sizeof(a), root); * \param sendrecv_data the pointer to the send/receive buffer, * \param size the data size * \param root the process root @@ -113,48 +119,54 @@ inline void Broadcast(std::vector *sendrecv_data, int root); */ inline void Broadcast(std::string *sendrecv_data, int root); /*! - * \brief performs in-place Allreduce on sendrecvbuf + * \brief performs in-place Allreduce on sendrecvbuf * this function is NOT thread-safe * * Example Usage: the following code does an Allreduce and outputs the sum as the result - * vector data(10); - * ... - * Allreduce(&data[0], data.size()); - * ... + * \code{.cpp} + * vector data(10); + * ... + * Allreduce(&data[0], data.size()); + * ... 
+ * \endcode + * * \param sendrecvbuf buffer for both sending and receiving data * \param count number of elements to be reduced * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg) * will be called by the function before performing Allreduce in order to initialize the data in sendrecvbuf. * If the result of Allreduce can be recovered directly, then prepare_func will NOT be called - * \param prepare_arg argument used to pass into the lazy preprocessing function - * \tparam OP see namespace op, reduce operator + * \param prepare_arg argument used to pass into the lazy preprocessing function + * \tparam OP see namespace op, reduce operator * \tparam DType data type */ template inline void Allreduce(DType *sendrecvbuf, size_t count, - void (*prepare_fun)(void *arg) = NULL, + void (*prepare_fun)(void *) = NULL, void *prepare_arg = NULL); // C++11 support for lambda prepare function -#if __cplusplus >= 201103L +#if DMLC_USE_CXX11 /*! * \brief performs in-place Allreduce, on sendrecvbuf * with a prepare function specified by a lambda function * - * Example Usage: the following code does an Allreduce and outputs the sum as the result - * vector data(10); - * ... - * Allreduce(&data[0], data.size(), [&]() { - * for (int i = 0; i < 10; ++i) { - * data[i] = i; - * } - * }); + * Example Usage: + * \code{.cpp} + * // the following code does an Allreduce and outputs the sum as the result + * vector data(10); + * ... + * Allreduce(&data[0], data.size(), [&]() { + * for (int i = 0; i < 10; ++i) { + * data[i] = i; + * } + * }); * ... + * \endcode * \param sendrecvbuf buffer for both sending and receiving data * \param count number of elements to be reduced * \param prepare_fun Lazy lambda preprocessing function, prepare_fun() will be invoked * by the function before performing Allreduce in order to initialize the data in sendrecvbuf. * If the result of Allreduce can be recovered directly, then prepare_func will NOT be called - * \tparam OP see namespace op, reduce operator + * \tparam OP see namespace op, reduce operator * \tparam DType data type */ template @@ -168,19 +180,20 @@ inline void Allreduce(DType *sendrecvbuf, size_t count, * is the same in every node * \param local_model pointer to the local model that is specific to the current node/rank * this can be NULL when no local model is needed - * + * * \return the version number of the check point loaded * if returned version == 0, this means no model has been CheckPointed * the p_model is not touched, users should do the necessary initialization by themselves - * - * Common usage example: - * int iter = rabit::LoadCheckPoint(&model); - * if (iter == 0) model.InitParameters(); - * for (i = iter; i < max_iter; ++i) { - * do many things, include allreduce - * rabit::CheckPoint(model); - * } * + * \code{.cpp} + * // Example usage code of LoadCheckPoint + * int iter = rabit::LoadCheckPoint(&model); + * if (iter == 0) model.InitParameters(); + * for (i = iter; i < max_iter; ++i) { + * // do many things, include allreduce + * rabit::CheckPoint(model); + * } + * \endcode * \sa CheckPoint, VersionNumber */ inline int LoadCheckPoint(Serializable *global_model, @@ -188,7 +201,7 @@ inline int LoadCheckPoint(Serializable *global_model, /*! * \brief checkpoints the model, meaning a stage of execution has finished. 
* every time we call check point, a version number will be increased by one - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller needs to guarantee that the global_model * is the same in every node @@ -204,16 +217,16 @@ inline void CheckPoint(const Serializable *global_model, /*! * \brief This function can be used to replace CheckPoint for global_model only, * when certain condition is met (see detailed explanation). - * + * * This is a "lazy" checkpoint such that only the pointer to the global_model is * remembered and no memory copy is taken. To use this function, the user MUST ensure that: * The global_model must remain unchanged until the last call of Allreduce/Broadcast in the current version finishes. - * In other words, the global_model model can be changed only between the last call of + * In other words, the global_model model can be changed only between the last call of * Allreduce/Broadcast and LazyCheckPoint, both in the same version - * + * * For example, suppose the calling sequence is: * LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint/(or can be CheckPoint) - * + * * Then the user MUST only change the global_model in code3. * * The use of LazyCheckPoint instead of CheckPoint will improve the efficiency of the program. @@ -235,36 +248,36 @@ namespace engine { class ReduceHandle; } // namespace engine /*! - * \brief template class to make customized reduce and all reduce easy - * Do not use reducer directly in the function you call Finalize, + * \brief template class to make customized reduce and all reduce easy + * Do not use reducer directly in the function you call Finalize, * because the destructor can execute after Finalize * \tparam DType data type that to be reduced * \tparam freduce the customized reduction function * DType must be a struct, with no pointer */ -template +template // NOLINT(*) class Reducer { public: Reducer(void); /*! - * \brief customized in-place all reduce operation + * \brief customized in-place all reduce operation * \param sendrecvbuf the in place send-recv buffer * \param count number of elements to be reduced * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg) * will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf. * If the result of Allreduce can be recovered directly, then prepare_func will NOT be called - * \param prepare_arg argument used to pass into the lazy preprocessing function + * \param prepare_arg argument used to pass into the lazy preprocessing function */ inline void Allreduce(DType *sendrecvbuf, size_t count, - void (*prepare_fun)(void *arg) = NULL, + void (*prepare_fun)(void *) = NULL, void *prepare_arg = NULL); -#if __cplusplus >= 201103L +#if DMLC_USE_CXX11 /*! 
* \brief customized in-place all reduce operation, with lambda function as preprocessor * \param sendrecvbuf pointer to the array of objects to be reduced * \param count number of elements to be reduced * \param prepare_fun lambda function executed to prepare the data, if necessary - */ + */ inline void Allreduce(DType *sendrecvbuf, size_t count, std::function prepare_fun); #endif @@ -278,7 +291,7 @@ class Reducer { * this class defines complex reducer handles all the data structure that can be * serialized/deserialized into fixed size buffer * Do not use reducer directly in the function you call Finalize, because the destructor can execute after Finalize - * + * * \tparam DType data type that to be reduced, DType must contain the following functions: * \tparam freduce the customized reduction function * (1) Save(IStream &fs) (2) Load(IStream &fs) (3) Reduce(const DType &src, size_t max_nbyte) @@ -288,7 +301,7 @@ class SerializeReducer { public: SerializeReducer(void); /*! - * \brief customized in-place all reduce operation + * \brief customized in-place all reduce operation * \param sendrecvobj pointer to the array of objects to be reduced * \param max_nbyte maximum amount of memory needed to serialize each object * this includes budget limit for intermediate and final result @@ -296,14 +309,14 @@ class SerializeReducer { * \param prepare_fun Lazy preprocessing function, if it is not NULL, prepare_fun(prepare_arg) * will be called by the function before performing Allreduce, to initialize the data in sendrecvbuf. * If the result of Allreduce can be recovered directly, then the prepare_func will NOT be called - * \param prepare_arg argument used to pass into the lazy preprocessing function + * \param prepare_arg argument used to pass into the lazy preprocessing function */ inline void Allreduce(DType *sendrecvobj, size_t max_nbyte, size_t count, - void (*prepare_fun)(void *arg) = NULL, + void (*prepare_fun)(void *) = NULL, void *prepare_arg = NULL); // C++11 support for lambda prepare function -#if __cplusplus >= 201103L +#if DMLC_USE_CXX11 /*! * \brief customized in-place all reduce operation, with lambda function as preprocessor * \param sendrecvobj pointer to the array of objects to be reduced @@ -311,7 +324,7 @@ class SerializeReducer { * this includes budget limit for intermediate and final result * \param count number of elements to be reduced * \param prepare_fun lambda function executed to prepare the data, if necessary - */ + */ inline void Allreduce(DType *sendrecvobj, size_t max_nbyte, size_t count, std::function prepare_fun); @@ -326,4 +339,4 @@ class SerializeReducer { } // namespace rabit // implementation of template functions #include "./rabit/rabit-inl.h" -#endif // RABIT_RABIT_H_ +#endif // RABIT_RABIT_H_ // NOLINT(*) diff --git a/subtree/rabit/include/rabit/engine.h b/subtree/rabit/include/rabit/engine.h index a2f5da25b42e..272bbb8ef378 100644 --- a/subtree/rabit/include/rabit/engine.h +++ b/subtree/rabit/include/rabit/engine.h @@ -183,7 +183,9 @@ enum DataType { kLong = 4, kULong = 5, kFloat = 6, - kDouble = 7 + kDouble = 7, + kLongLong = 8, + kULongLong = 9 }; } // namespace mpi /*! 
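// Illustrative sketch (not from the patch): with the C++11 guard switched to
// DMLC_USE_CXX11 and the new kLongLong/kULongLong entries in mpi::DataType,
// the lambda overload of rabit::Allreduce can be used directly on 64-bit
// counters. A minimal example, assuming rabit::Init()/Finalize() are handled
// by the surrounding program:
#include <vector>
#include <rabit.h>

inline void SumCounters() {
  std::vector<unsigned long long> counts(16, 0);
  rabit::Allreduce<rabit::op::Sum>(
      &counts[0], counts.size(),
      [&]() {
        // lazy initialization: runs only when the result cannot be restored
        // from a previous round by the fault-tolerant engine
        for (size_t i = 0; i < counts.size(); ++i) counts[i] = i;
      });
}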
diff --git a/subtree/rabit/include/rabit/io.h b/subtree/rabit/include/rabit/io.h index a0eb0adb89a8..7ffca38f296e 100644 --- a/subtree/rabit/include/rabit/io.h +++ b/subtree/rabit/include/rabit/io.h @@ -4,8 +4,8 @@ * \brief utilities with different serializable implementations * \author Tianqi Chen */ -#ifndef RABIT_UTILS_IO_H_ -#define RABIT_UTILS_IO_H_ +#ifndef RABIT_IO_H_ +#define RABIT_IO_H_ #include #include #include @@ -51,6 +51,7 @@ struct MemoryFixSizeBuffer : public SeekStream { virtual bool AtEnd(void) const { return curr_ptr_ == buffer_size_; } + private: /*! \brief in memory buffer */ char *p_buffer_; @@ -93,6 +94,7 @@ struct MemoryBufferStream : public SeekStream { virtual bool AtEnd(void) const { return curr_ptr_ == p_buffer_->length(); } + private: /*! \brief in memory buffer */ std::string *p_buffer_; @@ -101,4 +103,4 @@ struct MemoryBufferStream : public SeekStream { }; // class MemoryBufferStream } // namespace utils } // namespace rabit -#endif // RABIT_UTILS_IO_H_ +#endif // RABIT_IO_H_ diff --git a/subtree/rabit/include/rabit/rabit-inl.h b/subtree/rabit/include/rabit/rabit-inl.h index 3d1ec59a8c1a..e82b5a9a0d3a 100644 --- a/subtree/rabit/include/rabit/rabit-inl.h +++ b/subtree/rabit/include/rabit/rabit-inl.h @@ -1,12 +1,15 @@ /*! + * Copyright by Contributors * \file rabit-inl.h * \brief implementation of inline template function for rabit interface * * \author Tianqi Chen */ -#ifndef RABIT_RABIT_INL_H -#define RABIT_RABIT_INL_H +#ifndef RABIT_RABIT_INL_H_ +#define RABIT_RABIT_INL_H_ // use engine for implementation +#include +#include #include "./io.h" #include "./utils.h" #include "../rabit.h" @@ -30,15 +33,15 @@ inline DataType GetType(void) { return kInt; } template<> -inline DataType GetType(void) { +inline DataType GetType(void) { // NOLINT(*) return kUInt; } template<> -inline DataType GetType(void) { +inline DataType GetType(void) { // NOLINT(*) return kLong; } template<> -inline DataType GetType(void) { +inline DataType GetType(void) { // NOLINT(*) return kULong; } template<> @@ -49,47 +52,55 @@ template<> inline DataType GetType(void) { return kDouble; } +template<> +inline DataType GetType(void) { // NOLINT(*) + return kLongLong; +} +template<> +inline DataType GetType(void) { // NOLINT(*) + return kULongLong; +} } // namespace mpi } // namespace engine namespace op { struct Max { - const static engine::mpi::OpType kType = engine::mpi::kMax; + static const engine::mpi::OpType kType = engine::mpi::kMax; template - inline static void Reduce(DType &dst, const DType &src) { + inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*) if (dst < src) dst = src; } }; struct Min { - const static engine::mpi::OpType kType = engine::mpi::kMin; + static const engine::mpi::OpType kType = engine::mpi::kMin; template - inline static void Reduce(DType &dst, const DType &src) { + inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*) if (dst > src) dst = src; } }; struct Sum { - const static engine::mpi::OpType kType = engine::mpi::kSum; + static const engine::mpi::OpType kType = engine::mpi::kSum; template - inline static void Reduce(DType &dst, const DType &src) { + inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*) dst += src; } }; struct BitOR { - const static engine::mpi::OpType kType = engine::mpi::kBitwiseOR; + static const engine::mpi::OpType kType = engine::mpi::kBitwiseOR; template - inline static void Reduce(DType &dst, const DType &src) { + inline static void Reduce(DType &dst, const DType &src) { // NOLINT(*) dst 
|= src; } }; template inline void Reducer(const void *src_, void *dst_, int len, const MPI::Datatype &dtype) { const DType *src = (const DType*)src_; - DType *dst = (DType*)dst_; + DType *dst = (DType*)dst_; // NOLINT(*) for (int i = 0; i < len; ++i) { OP::Reduce(dst[i], src[i]); } } -} // namespace op +} // namespace op // intialize the rabit engine inline void Init(int argc, char *argv[]) { @@ -144,23 +155,23 @@ inline void Broadcast(std::string *sendrecv_data, int root) { // perform inplace Allreduce template inline void Allreduce(DType *sendrecvbuf, size_t count, - void (*prepare_fun)(void *arg), + void (*prepare_fun)(void *arg), void *prepare_arg) { - engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer, + engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer, engine::mpi::GetType(), OP::kType, prepare_fun, prepare_arg); } // C++11 support for lambda prepare function -#if __cplusplus >= 201103L +#if DMLC_USE_CXX11 inline void InvokeLambda_(void *fun) { (*static_cast*>(fun))(); } template inline void Allreduce(DType *sendrecvbuf, size_t count, std::function prepare_fun) { - engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer, + engine::Allreduce_(sendrecvbuf, sizeof(DType), count, op::Reducer, engine::mpi::GetType(), OP::kType, InvokeLambda_, &prepare_fun); } -#endif // C++11 +#endif // C++11 // print message to the tracker inline void TrackerPrint(const std::string &msg) { @@ -215,15 +226,16 @@ inline void ReducerSafe_(const void *src_, void *dst_, int len_, const MPI::Data } } // function to perform reduction for Reducer -template -inline void ReducerAlign_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) { +template // NOLINT(*) +inline void ReducerAlign_(const void *src_, void *dst_, + int len_, const MPI::Datatype &dtype) { const DType *psrc = reinterpret_cast(src_); DType *pdst = reinterpret_cast(dst_); for (int i = 0; i < len_; ++i) { freduce(pdst[i], psrc[i]); } } -template +template // NOLINT(*) inline Reducer::Reducer(void) { // it is safe to directly use handle for aligned data types if (sizeof(DType) == 8 || sizeof(DType) == 4 || sizeof(DType) == 1) { @@ -232,7 +244,7 @@ inline Reducer::Reducer(void) { this->handle_.Init(ReducerSafe_, sizeof(DType)); } } -template +template // NOLINT(*) inline void Reducer::Allreduce(DType *sendrecvbuf, size_t count, void (*prepare_fun)(void *arg), void *prepare_arg) { @@ -240,13 +252,14 @@ inline void Reducer::Allreduce(DType *sendrecvbuf, size_t count, } // function to perform reduction for SerializeReducer template -inline void SerializeReducerFunc_(const void *src_, void *dst_, int len_, const MPI::Datatype &dtype) { +inline void SerializeReducerFunc_(const void *src_, void *dst_, + int len_, const MPI::Datatype &dtype) { int nbytes = engine::ReduceHandle::TypeSize(dtype); // temp space DType tsrc, tdst; for (int i = 0; i < len_; ++i) { - utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes); - utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes); + utils::MemoryFixSizeBuffer fsrc((char*)(src_) + i * nbytes, nbytes); // NOLINT(*) + utils::MemoryFixSizeBuffer fdst((char*)(dst_) + i * nbytes, nbytes); // NOLINT(*) tsrc.Load(fsrc); tdst.Load(fdst); // govern const check @@ -288,8 +301,8 @@ inline void SerializeReducer::Allreduce(DType *sendrecvobj, // setup closure SerializeReduceClosure c; c.sendrecvobj = sendrecvobj; c.max_nbyte = max_nbyte; c.count = count; - c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_; - // invoke 
here + c.prepare_fun = prepare_fun; c.prepare_arg = prepare_arg; c.p_buffer = &buffer_; + // invoke here handle_.Allreduce(BeginPtr(buffer_), max_nbyte, count, SerializeReduceClosure::Invoke, &c); for (size_t i = 0; i < count; ++i) { @@ -298,8 +311,8 @@ inline void SerializeReducer::Allreduce(DType *sendrecvobj, } } -#if __cplusplus >= 201103L -template +#if DMLC_USE_CXX11 +template // NOLINT(*)g inline void Reducer::Allreduce(DType *sendrecvbuf, size_t count, std::function prepare_fun) { this->Allreduce(sendrecvbuf, count, InvokeLambda_, &prepare_fun); @@ -312,4 +325,4 @@ inline void SerializeReducer::Allreduce(DType *sendrecvobj, } #endif } // namespace rabit -#endif +#endif // RABIT_RABIT_INL_H_ diff --git a/subtree/rabit/include/rabit/timer.h b/subtree/rabit/include/rabit/timer.h index 46b7affc40bc..1f135add6e52 100644 --- a/subtree/rabit/include/rabit/timer.h +++ b/subtree/rabit/include/rabit/timer.h @@ -1,4 +1,5 @@ /*! + * Copyright by Contributors * \file timer.h * \brief This file defines the utils for timing * \author Tianqi Chen, Nacho, Tianyi @@ -18,7 +19,6 @@ namespace utils { * \brief return time in seconds, not cross platform, avoid to use this in most places */ inline double GetTime(void) { - // TODO: use c++11 chrono when c++11 was available #ifdef __MACH__ clock_serv_t cclock; mach_timespec_t mts; @@ -32,7 +32,6 @@ inline double GetTime(void) { utils::Check(clock_gettime(CLOCK_REALTIME, &ts) == 0, "failed to get time"); return static_cast(ts.tv_sec) + static_cast(ts.tv_nsec) * 1e-9; #else - // TODO: add MSVC macro, and MSVC timer return static_cast(time(NULL)); #endif #endif diff --git a/subtree/rabit/include/rabit/utils.h b/subtree/rabit/include/rabit/utils.h index 0f48fa0fa4a8..28709ee7df42 100644 --- a/subtree/rabit/include/rabit/utils.h +++ b/subtree/rabit/include/rabit/utils.h @@ -27,7 +27,7 @@ #else #ifdef _FILE_OFFSET_BITS #if _FILE_OFFSET_BITS == 32 -#pragma message ("Warning: FILE OFFSET BITS defined to be 32 bit") +#pragma message("Warning: FILE OFFSET BITS defined to be 32 bit") #endif #endif @@ -59,17 +59,17 @@ namespace utils { const int kPrintBuffer = 1 << 12; #ifndef RABIT_CUSTOMIZE_MSG_ -/*! +/*! * \brief handling of Assert error, caused by inappropriate input - * \param msg error message + * \param msg error message */ inline void HandleAssertError(const char *msg) { fprintf(stderr, "AssertError:%s\n", msg); exit(-1); } -/*! +/*! * \brief handling of Check error, caused by inappropriate input - * \param msg error message + * \param msg error message */ inline void HandleCheckError(const char *msg) { fprintf(stderr, "%s\n", msg); @@ -163,7 +163,7 @@ inline std::FILE *FopenCheck(const char *fname, const char *flag) { // easy utils that can be directly accessed in xgboost /*! \brief get the beginning address of a vector */ template -inline T *BeginPtr(std::vector &vec) { +inline T *BeginPtr(std::vector &vec) { // NOLINT(*) if (vec.size() == 0) { return NULL; } else { @@ -172,14 +172,14 @@ inline T *BeginPtr(std::vector &vec) { } /*! 
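// Illustrative sketch (not from the patch): the op::Max/Min/Sum/BitOR structs
// earlier in this header show the shape a reduction needs; for composite types
// the rabit::Reducer template takes a free function of the same shape. 'Vec3'
// and ElementwiseSum below are hypothetical.
struct Vec3 {
  double v[3];                                           // plain struct, no pointers
};
inline void ElementwiseSum(Vec3 &dst, const Vec3 &src) { // NOLINT(*)
  for (int i = 0; i < 3; ++i) dst.v[i] += src.v[i];
}
// Usage (after rabit::Init()):
//   rabit::Reducer<Vec3, ElementwiseSum> reducer;
//   reducer.Allreduce(buf, count);   // buf is a Vec3 array of length count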
\brief get the beginning address of a vector */ template -inline const T *BeginPtr(const std::vector &vec) { +inline const T *BeginPtr(const std::vector &vec) { // NOLINT(*) if (vec.size() == 0) { return NULL; } else { return &vec[0]; } } -inline char* BeginPtr(std::string &str) { +inline char* BeginPtr(std::string &str) { // NOLINT(*) if (str.length() == 0) return NULL; return &str[0]; } diff --git a/subtree/rabit/include/rabit_serializable.h b/subtree/rabit/include/rabit_serializable.h index 40266575b8e1..c9199bba125b 100644 --- a/subtree/rabit/include/rabit_serializable.h +++ b/subtree/rabit/include/rabit_serializable.h @@ -4,8 +4,8 @@ * \brief defines serializable interface of rabit * \author Tianqi Chen */ -#ifndef RABIT_RABIT_SERIALIZABLE_H_ -#define RABIT_RABIT_SERIALIZABLE_H_ +#ifndef RABIT_SERIALIZABLE_H_ +#define RABIT_SERIALIZABLE_H_ #include #include #include "./rabit/utils.h" @@ -13,15 +13,15 @@ namespace rabit { /*! - * \brief defines stream used in rabit - * see definition of Stream in dmlc/io.h + * \brief defines stream used in rabit + * see definition of Stream in dmlc/io.h */ typedef dmlc::Stream Stream; /*! - * \brief defines serializable objects used in rabit - * see definition of Serializable in dmlc/io.h + * \brief defines serializable objects used in rabit + * see definition of Serializable in dmlc/io.h */ typedef dmlc::Serializable Serializable; } // namespace rabit -#endif // RABIT_RABIT_SERIALIZABLE_H_ +#endif // RABIT_SERIALIZABLE_H_ diff --git a/subtree/rabit/scripts/travis_runtest.sh b/subtree/rabit/scripts/travis_runtest.sh new file mode 100755 index 000000000000..f57141c6c0cb --- /dev/null +++ b/subtree/rabit/scripts/travis_runtest.sh @@ -0,0 +1,8 @@ +#!/bin/bash +make -f test.mk model_recover_10_10k || exit -1 +make -f test.mk model_recover_10_10k_die_same || exit -1 +make -f test.mk local_recover_10_10k || exit -1 +make -f test.mk pylocal_recover_10_10k || exit -1 +make -f test.mk lazy_recover_10_10k_die_hard || exit -1 +make -f test.mk lazy_recover_10_10k_die_same || exit -1 +make -f test.mk ringallreduce_10_10k || exit -1 \ No newline at end of file diff --git a/subtree/rabit/scripts/travis_script.sh b/subtree/rabit/scripts/travis_script.sh new file mode 100755 index 000000000000..664582906a85 --- /dev/null +++ b/subtree/rabit/scripts/travis_script.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +# main script of travis +if [ ${TASK} == "lint" ]; then + make lint || exit -1 +fi + +if [ ${TASK} == "doc" ]; then + make doc 2>log.txt + (cat log.txt| grep -v ENABLE_PREPROCESSING |grep -v "unsupported tag" |grep warning) && exit -1 +fi + +if [ ${TASK} == "build" ]; then + make all || exit -1 +fi + +if [ ${TASK} == "test" ]; then + cd test + make all || exit -1 + ../scripts/travis_runtest.sh || exit -1 +fi + diff --git a/subtree/rabit/src/allreduce_base.cc b/subtree/rabit/src/allreduce_base.cc index 964738b343f3..d3b7502fff26 100644 --- a/subtree/rabit/src/allreduce_base.cc +++ b/subtree/rabit/src/allreduce_base.cc @@ -24,6 +24,7 @@ AllreduceBase::AllreduceBase(void) { nport_trial = 1000; rank = 0; world_size = -1; + connect_retry = 5; hadoop_mode = 0; version_number = 0; // 32 K items @@ -46,6 +47,7 @@ AllreduceBase::AllreduceBase(void) { env_vars.push_back("DMLC_NUM_ATTEMPT"); env_vars.push_back("DMLC_TRACKER_URI"); env_vars.push_back("DMLC_TRACKER_PORT"); + env_vars.push_back("DMLC_WORKER_CONNECT_RETRY"); } // initialization function @@ -94,7 +96,8 @@ void AllreduceBase::Init(void) { } } if (dmlc_role != "worker") { - fprintf(stderr, "Rabit Module currently only work 
with dmlc worker, quit this program by exit 0\n"); + fprintf(stderr, "Rabit Module currently only work with dmlc worker"\ + ", quit this program by exit 0\n"); exit(0); } // clear the setting before start reconnection @@ -134,7 +137,7 @@ void AllreduceBase::TrackerPrint(const std::string &msg) { // util to parse data with unit suffix inline size_t ParseUnit(const char *name, const char *val) { char unit; - unsigned long amt; + unsigned long amt; // NOLINT(*) int n = sscanf(val, "%lu%c", &amt, &unit); size_t amount = amt; if (n == 2) { @@ -154,7 +157,7 @@ inline size_t ParseUnit(const char *name, const char *val) { } } /*! - * \brief set parameters to the engine + * \brief set parameters to the engine * \param name parameter name * \param val parameter value */ @@ -174,6 +177,9 @@ void AllreduceBase::SetParam(const char *name, const char *val) { if (!strcmp(name, "rabit_reduce_buffer")) { reduce_buffer_size = (ParseUnit(name, val) + 7) >> 3; } + if (!strcmp(name, "DMLC_WORKER_CONNECT_RETRY")) { + connect_retry = atoi(val); + } } /*! * \brief initialize connection to the tracker @@ -184,9 +190,27 @@ utils::TCPSocket AllreduceBase::ConnectTracker(void) const { // get information from tracker utils::TCPSocket tracker; tracker.Create(); - if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) { - utils::Socket::Error("Connect"); - } + + int retry = 0; + do { + fprintf(stderr, "connect to ip: [%s]\n", tracker_uri.c_str()); + if (!tracker.Connect(utils::SockAddr(tracker_uri.c_str(), tracker_port))) { + if (++retry >= connect_retry) { + fprintf(stderr, "connect to (failed): [%s]\n", tracker_uri.c_str()); + utils::Socket::Error("Connect"); + } else { + fprintf(stderr, "retry connect to ip(retry time %d): [%s]\n", retry, tracker_uri.c_str()); + #ifdef _MSC_VER + Sleep(1); + #else + sleep(1); + #endif + continue; + } + } + break; + } while (1); + using utils::Assert; Assert(tracker.SendAll(&magic, sizeof(magic)) == sizeof(magic), "ReConnectLink failure 1"); @@ -258,7 +282,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { } else { if (!all_links[i].sock.IsClosed()) all_links[i].sock.Close(); } - } + } int ngood = static_cast(good_link.size()); Assert(tracker.SendAll(&ngood, sizeof(ngood)) == sizeof(ngood), "ReConnectLink failure 5"); @@ -359,7 +383,7 @@ void AllreduceBase::ReConnectLinks(const char *cmd) { * The kSuccess TryAllreduce does NOT mean every node have successfully finishes TryAllreduce. * It only means the current node get the correct result of Allreduce. 
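// Illustrative sketch (not from the patch): shape of the tracker-connect retry
// added to ConnectTracker() above. The retry budget comes from the new
// DMLC_WORKER_CONNECT_RETRY variable (default 5, parsed in SetParam()); each
// failed attempt waits one second before trying again. TryConnect() below is a
// hypothetical stand-in for tracker.Connect(utils::SockAddr(uri, port)).
#include <unistd.h>              // sleep(); the patch uses Sleep(1) under _MSC_VER

bool TryConnect();               // hypothetical single connection attempt

inline bool ConnectWithRetry(int connect_retry) {
  int retry = 0;
  while (!TryConnect()) {
    if (++retry >= connect_retry) return false;  // caller raises Socket::Error("Connect")
    sleep(1);                                    // back off before the next attempt
  }
  return true;
}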
* However, it means every node finishes LAST call(instead of this one) of Allreduce/Bcast - * + * * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have * \param count number of elements to be reduced @@ -440,7 +464,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, selecter.WatchRead(links[i].sock); } // size_write <= size_read - if (links[i].size_write != total_size){ + if (links[i].size_write != total_size) { if (links[i].size_write < size_down_in) { selecter.WatchWrite(links[i].sock); } @@ -477,7 +501,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, size_t max_reduce = total_size; for (int i = 0; i < nlink; ++i) { if (i != parent_index) { - max_reduce= std::min(max_reduce, links[i].size_read); + max_reduce = std::min(max_reduce, links[i].size_read); utils::Assert(buffer_size == 0 || buffer_size == links[i].buffer_size, "buffer size inconsistent"); buffer_size = links[i].buffer_size; @@ -513,7 +537,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, if (len != -1) { size_up_out += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) { return ReportError(&links[parent_index], ret); } @@ -525,7 +549,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, ssize_t len = links[parent_index].sock. Recv(sendrecvbuf + size_down_in, total_size - size_down_in); if (len == 0) { - links[parent_index].sock.Close(); + links[parent_index].sock.Close(); return ReportError(&links[parent_index], kRecvZeroLen); } if (len != -1) { @@ -533,7 +557,7 @@ AllreduceBase::TryAllreduceTree(void *sendrecvbuf_, utils::Assert(size_down_in <= size_up_out, "Allreduce: boundary error"); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) { return ReportError(&links[parent_index], ret); } @@ -670,7 +694,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, size_t slice_begin, size_t slice_end, size_t size_prev_slice) { - // read from next link and send to prev one + // read from next link and send to prev one LinkRecord &prev = *ring_prev, &next = *ring_next; // need to reply on special rank structure utils::Assert(next.rank == (rank + 1) % world_size && @@ -678,11 +702,11 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, "need to assume rank structure"); // send recv buffer char *sendrecvbuf = reinterpret_cast(sendrecvbuf_); - const size_t stop_read = total_size + slice_begin; - const size_t stop_write = total_size + slice_begin - size_prev_slice; + const size_t stop_read = total_size + slice_begin; + const size_t stop_write = total_size + slice_begin - size_prev_slice; size_t write_ptr = slice_begin; size_t read_ptr = slice_end; - + while (true) { // select helper bool finished = true; @@ -709,7 +733,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, if (len != -1) { read_ptr += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&next, ret); } } @@ -723,7 +747,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, if (len != -1) { write_ptr += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&prev, ret); } } @@ -733,7 +757,7 @@ AllreduceBase::TryAllgatherRing(void *sendrecvbuf_, size_t total_size, /*! 
* \brief perform in-place allreduce, on sendrecvbuf, this function can fail, * and will return the cause of failure - * + * * Ring-based algorithm * * \param sendrecvbuf_ buffer for both sending and recving data @@ -748,7 +772,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_, size_t type_nbytes, size_t count, ReduceFunction reducer) { - // read from next link and send to prev one + // read from next link and send to prev one LinkRecord &prev = *ring_prev, &next = *ring_next; // need to reply on special rank structure utils::Assert(next.rank == (rank + 1) % world_size && @@ -757,7 +781,7 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_, // total size of message const size_t total_size = type_nbytes * count; size_t n = static_cast(world_size); - size_t step = (count + n - 1) / n; + size_t step = (count + n - 1) / n; size_t r = static_cast(next.rank); size_t write_ptr = std::min(r * step, count) * type_nbytes; size_t read_ptr = std::min((r + 1) * step, count) * type_nbytes; @@ -826,11 +850,11 @@ AllreduceBase::TryReduceScatterRing(void *sendrecvbuf_, if (len != -1) { write_ptr += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&prev, ret); } } - } + } return kSuccess; } /*! @@ -857,7 +881,7 @@ AllreduceBase::TryAllreduceRing(void *sendrecvbuf_, size_t end = std::min((rank + 1) * step, count) * type_nbytes; // previous rank int prank = ring_prev->rank; - // get rank of previous + // get rank of previous return TryAllgatherRing (sendrecvbuf_, type_nbytes * count, begin, end, diff --git a/subtree/rabit/src/allreduce_base.h b/subtree/rabit/src/allreduce_base.h index c34eb6042a50..63acd75d5868 100644 --- a/subtree/rabit/src/allreduce_base.h +++ b/subtree/rabit/src/allreduce_base.h @@ -42,7 +42,7 @@ class AllreduceBase : public IEngine { // shutdown the engine virtual void Shutdown(void); /*! - * \brief set parameters to the engine + * \brief set parameters to the engine * \param name parameter name * \param val parameter value */ @@ -72,7 +72,7 @@ class AllreduceBase : public IEngine { return host_uri; } /*! - * \brief perform in-place allreduce, on sendrecvbuf + * \brief perform in-place allreduce, on sendrecvbuf * this function is NOT thread-safe * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have @@ -82,7 +82,7 @@ class AllreduceBase : public IEngine { * will be called by the function before performing Allreduce, to intialize the data in sendrecvbuf_. 
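// Illustrative sketch (not from the patch): worked example of the segment
// layout used by the ring reduce-scatter/allreduce code earlier above. With
// count = 10 elements and world_size = 4, step = ceil(10 / 4) = 3, so rank k
// owns [k*step, min((k+1)*step, count)): [0,3), [3,6), [6,9), [9,10).
#include <cstdio>
#include <cstddef>
#include <algorithm>

int main() {
  const size_t count = 10, world_size = 4;
  const size_t step = (count + world_size - 1) / world_size;  // same formula as the patch
  for (size_t k = 0; k < world_size; ++k) {
    std::printf("rank %u owns [%u, %u)\n", (unsigned)k,
                (unsigned)std::min(k * step, count),
                (unsigned)std::min((k + 1) * step, count));
  }
  return 0;
}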
* If the result of Allreduce can be recovered directly, then prepare_func will NOT be called * \param prepare_arg argument used to passed into the lazy preprocessing function - */ + */ virtual void Allreduce(void *sendrecvbuf_, size_t type_nbytes, size_t count, @@ -90,6 +90,7 @@ class AllreduceBase : public IEngine { PreprocFunction prepare_fun = NULL, void *prepare_arg = NULL) { if (prepare_fun != NULL) prepare_fun(prepare_arg); + if (world_size == 1) return; utils::Assert(TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer) == kSuccess, "Allreduce failed"); @@ -101,6 +102,7 @@ class AllreduceBase : public IEngine { * \param root the root worker id to broadcast the data */ virtual void Broadcast(void *sendrecvbuf_, size_t total_size, int root) { + if (world_size == 1) return; utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess, "Broadcast failed"); } @@ -115,14 +117,14 @@ class AllreduceBase : public IEngine { * \return the version number of check point loaded * if returned version == 0, this means no model has been CheckPointed * the p_model is not touched, user should do necessary initialization by themselves - * + * * Common usage example: * int iter = rabit::LoadCheckPoint(&model); * if (iter == 0) model.InitParameters(); * for (i = iter; i < max_iter; ++i) { * do many things, include allreduce * rabit::CheckPoint(model); - * } + * } * * \sa CheckPoint, VersionNumber */ @@ -133,7 +135,7 @@ class AllreduceBase : public IEngine { /*! * \brief checkpoint the model, meaning we finished a stage of execution * every time we call check point, there is a version number which will increase by one - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller need to gauranttees that global_model * is the same in all nodes @@ -153,16 +155,16 @@ class AllreduceBase : public IEngine { /*! * \brief This function can be used to replace CheckPoint for global_model only, * when certain condition is met(see detailed expplaination). - * + * * This is a "lazy" checkpoint such that only the pointer to global_model is * remembered and no memory copy is taken. To use this function, the user MUST ensure that: * The global_model must remain unchanged util last call of Allreduce/Broadcast in current version finishs. - * In another words, global_model model can be changed only between last call of + * In another words, global_model model can be changed only between last call of * Allreduce/Broadcast and LazyCheckPoint in current version - * + * * For example, suppose the calling sequence is: * LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint - * + * * If user can only changes global_model in code3, then LazyCheckPoint can be used to * improve efficiency of the program. * \param global_model pointer to the globally shared model/state @@ -189,8 +191,8 @@ class AllreduceBase : public IEngine { virtual void InitAfterException(void) { utils::Error("InitAfterException: not implemented"); } - /*! - * \brief report current status to the job tracker + /*! + * \brief report current status to the job tracker * depending on the job tracker we are in */ inline void ReportStatus(void) const { @@ -211,7 +213,7 @@ class AllreduceBase : public IEngine { kRecvZeroLen, /*! \brief a neighbor node go down, the connection is dropped */ kSockError, - /*! + /*! 
* \brief another node which is not my neighbor go down, * get Out-of-Band exception notification from my neighbor */ @@ -223,7 +225,7 @@ class AllreduceBase : public IEngine { ReturnTypeEnum value; // constructor ReturnType() {} - ReturnType(ReturnTypeEnum value) : value(value){} + ReturnType(ReturnTypeEnum value) : value(value) {} // NOLINT(*) inline bool operator==(const ReturnTypeEnum &v) const { return value == v; } @@ -232,8 +234,13 @@ class AllreduceBase : public IEngine { } }; /*! \brief translate errno to return type */ - inline static ReturnType Errno2Return(int errsv) { - if (errsv == EAGAIN || errsv == EWOULDBLOCK) return kSuccess; + inline static ReturnType Errno2Return() { + int errsv = utils::Socket::GetLastError(); + if (errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == 0) return kSuccess; +#ifdef _WIN32 + if (errsv == WSAEWOULDBLOCK) return kSuccess; + if (errsv == WSAECONNRESET) return kConnReset; +#endif if (errsv == ECONNRESET) return kConnReset; return kSockError; } @@ -253,7 +260,7 @@ class AllreduceBase : public IEngine { // buffer size, in bytes size_t buffer_size; // constructor - LinkRecord(void) + LinkRecord(void) : buffer_head(NULL), buffer_size(0) { } // initialize buffer @@ -297,7 +304,7 @@ class AllreduceBase : public IEngine { if (len == 0) { sock.Close(); return kRecvZeroLen; } - if (len == -1) return Errno2Return(errno); + if (len == -1) return Errno2Return(); size_read += static_cast(len); return kSuccess; } @@ -316,7 +323,7 @@ class AllreduceBase : public IEngine { if (len == 0) { sock.Close(); return kRecvZeroLen; } - if (len == -1) return Errno2Return(errno); + if (len == -1) return Errno2Return(); size_read += static_cast(len); return kSuccess; } @@ -329,7 +336,7 @@ class AllreduceBase : public IEngine { inline ReturnType WriteFromArray(const void *sendbuf_, size_t max_size) { const char *p = static_cast(sendbuf_); ssize_t len = sock.Send(p + size_write, max_size - size_write); - if (len == -1) return Errno2Return(errno); + if (len == -1) return Errno2Return(); size_write += static_cast(len); return kSuccess; } @@ -370,7 +377,7 @@ class AllreduceBase : public IEngine { * The kSuccess TryAllreduce does NOT mean every node have successfully finishes TryAllreduce. * It only means the current node get the correct result of Allreduce. * However, it means every node finishes LAST call(instead of this one) of Allreduce/Bcast - * + * * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have * \param count number of elements to be reduced @@ -390,7 +397,7 @@ class AllreduceBase : public IEngine { * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details * \sa ReturnType */ - ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root); + ReturnType TryBroadcast(void *sendrecvbuf_, size_t size, int root); /*! * \brief perform in-place allreduce, on sendrecvbuf, * this function implements tree-shape reduction @@ -426,14 +433,14 @@ class AllreduceBase : public IEngine { size_t size_prev_slice); /*! 
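// Illustrative sketch (not from the patch): a simplified POSIX-only restatement
// of what the new Errno2Return() above decides -- "would block" (or no error at
// all) means keep waiting, a peer reset is recoverable by the robust engine,
// anything else is fatal for the link. On Windows the same decision is made
// from WSAGetLastError() using WSAEWOULDBLOCK / WSAECONNRESET. The enum names
// below are illustrative only, not the real ReturnTypeEnum.
#include <cerrno>

enum LinkStatus { kLinkOk, kLinkPeerReset, kLinkFatal };

inline LinkStatus ClassifySocketErrno(int errsv) {
  if (errsv == EAGAIN || errsv == EWOULDBLOCK || errsv == 0) return kLinkOk;
  if (errsv == ECONNRESET) return kLinkPeerReset;
  return kLinkFatal;
}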
* \brief perform in-place allreduce, reduce on the sendrecvbuf, - * + * * after the function, node k get k-th segment of the reduction result * the k-th segment is defined by [k * step, min((k + 1) * step,count) ) * where step = ceil(count / world_size) * * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have - * \param count number of elements to be reduced + * \param count number of elements to be reduced * \param reducer reduce function * \return this function can return kSuccess, kSockError, kGetExcept, see ReturnType for details * \sa ReturnType, TryAllreduce @@ -458,7 +465,7 @@ class AllreduceBase : public IEngine { size_t count, ReduceFunction reducer); /*! - * \brief function used to report error when a link goes wrong + * \brief function used to report error when a link goes wrong * \param link the pointer to the link who causes the error * \param err the error type */ @@ -512,7 +519,9 @@ class AllreduceBase : public IEngine { int rank; // world size int world_size; + // connect retry time + int connect_retry; }; } // namespace engine } // namespace rabit -#endif // RABIT_ALLREDUCE_BASE_H +#endif // RABIT_ALLREDUCE_BASE_H_ diff --git a/subtree/rabit/src/allreduce_mock.h b/subtree/rabit/src/allreduce_mock.h index 4c271e7ba4b9..c3f9f4f1ddf0 100644 --- a/subtree/rabit/src/allreduce_mock.h +++ b/subtree/rabit/src/allreduce_mock.h @@ -1,8 +1,9 @@ /*! + * Copyright by Contributors * \file allreduce_mock.h * \brief Mock test module of AllReduce engine, * insert failures in certain call point, to test if the engine is robust to failure - * + * * \author Ignacio Cano, Tianqi Chen */ #ifndef RABIT_ALLREDUCE_MOCK_H_ @@ -68,7 +69,7 @@ class AllreduceMock : public AllreduceRobust { DummySerializer dum; ComboSerializer com(global_model, local_model); return AllreduceRobust::LoadCheckPoint(&dum, &com); - } + } } virtual void CheckPoint(const Serializable *global_model, const Serializable *local_model) { @@ -100,6 +101,7 @@ class AllreduceMock : public AllreduceRobust { this->Verify(MockKey(rank, version_number, seq_counter, num_trial), "LazyCheckPoint"); AllreduceRobust::LazyCheckPoint(global_model); } + protected: // force checkpoint to local int force_local; @@ -108,7 +110,7 @@ class AllreduceMock : public AllreduceRobust { // sum of allreduce double tsum_allreduce; double time_checkpoint; - + private: struct DummySerializer : public Serializable { virtual void Load(Stream *fi) { @@ -126,7 +128,7 @@ class AllreduceMock : public AllreduceRobust { } ComboSerializer(const Serializable *lhs, const Serializable *rhs) : lhs(NULL), rhs(NULL), c_lhs(lhs), c_rhs(rhs) { - } + } virtual void Load(Stream *fi) { if (lhs != NULL) lhs->Load(fi); if (rhs != NULL) rhs->Load(fi); @@ -143,10 +145,10 @@ class AllreduceMock : public AllreduceRobust { int seqno; int ntrial; MockKey(void) {} - MockKey(int rank, int version, int seqno, int ntrial) + MockKey(int rank, int version, int seqno, int ntrial) : rank(rank), version(version), seqno(seqno), ntrial(ntrial) {} inline bool operator==(const MockKey &b) const { - return rank == b.rank && + return rank == b.rank && version == b.version && seqno == b.seqno && ntrial == b.ntrial; @@ -173,4 +175,4 @@ class AllreduceMock : public AllreduceRobust { }; } // namespace engine } // namespace rabit -#endif // RABIT_ALLREDUCE_MOCK_H_ +#endif // RABIT_ALLREDUCE_MOCK_H_ diff --git a/subtree/rabit/src/allreduce_robust-inl.h b/subtree/rabit/src/allreduce_robust-inl.h index d8cc8dcddbf6..d3cbc003306e 100644 --- 
a/subtree/rabit/src/allreduce_robust-inl.h +++ b/subtree/rabit/src/allreduce_robust-inl.h @@ -2,17 +2,17 @@ * Copyright (c) 2014 by Contributors * \file allreduce_robust-inl.h * \brief implementation of inline template function in AllreduceRobust - * + * * \author Tianqi Chen */ -#ifndef RABIT_ENGINE_ROBUST_INL_H_ -#define RABIT_ENGINE_ROBUST_INL_H_ +#ifndef RABIT_ALLREDUCE_ROBUST_INL_H_ +#define RABIT_ALLREDUCE_ROBUST_INL_H_ #include namespace rabit { namespace engine { /*! - * \brief run message passing algorithm on the allreduce tree + * \brief run message passing algorithm on the allreduce tree * the result is edge message stored in p_edge_in and p_edge_out * \param node_value the value associated with current node * \param p_edge_in used to store input message from each of the edge @@ -35,7 +35,7 @@ inline AllreduceRobust::ReturnType AllreduceRobust::MsgPassing(const NodeType &node_value, std::vector *p_edge_in, std::vector *p_edge_out, - EdgeType (*func) + EdgeType(*func) (const NodeType &node_value, const std::vector &edge_in, size_t out_index)) { @@ -80,8 +80,16 @@ AllreduceRobust::MsgPassing(const NodeType &node_value, selecter.WatchRead(links[i].sock); } break; - case 1: if (i == parent_index) selecter.WatchWrite(links[i].sock); break; - case 2: if (i == parent_index) selecter.WatchRead(links[i].sock); break; + case 1: + if (i == parent_index) { + selecter.WatchWrite(links[i].sock); + } + break; + case 2: + if (i == parent_index) { + selecter.WatchRead(links[i].sock); + } + break; case 3: if (i != parent_index && links[i].size_write != sizeof(EdgeType)) { selecter.WatchWrite(links[i].sock); @@ -158,4 +166,4 @@ AllreduceRobust::MsgPassing(const NodeType &node_value, } } // namespace engine } // namespace rabit -#endif // RABIT_ENGINE_ROBUST_INL_H_ +#endif // RABIT_ALLREDUCE_ROBUST_INL_H_ diff --git a/subtree/rabit/src/allreduce_robust.cc b/subtree/rabit/src/allreduce_robust.cc index 33960349832a..175751842930 100644 --- a/subtree/rabit/src/allreduce_robust.cc +++ b/subtree/rabit/src/allreduce_robust.cc @@ -27,7 +27,7 @@ AllreduceRobust::AllreduceRobust(void) { result_buffer_round = 1; global_lazycheck = NULL; use_local_model = -1; - recover_counter = 0; + recover_counter = 0; env_vars.push_back("rabit_global_replica"); env_vars.push_back("rabit_local_replica"); } @@ -49,7 +49,7 @@ void AllreduceRobust::Shutdown(void) { AllreduceBase::Shutdown(); } /*! - * \brief set parameters to the engine + * \brief set parameters to the engine * \param name parameter name * \param val parameter value */ @@ -61,7 +61,7 @@ void AllreduceRobust::SetParam(const char *name, const char *val) { } } /*! 
- * \brief perform in-place allreduce, on sendrecvbuf + * \brief perform in-place allreduce, on sendrecvbuf * this function is NOT thread-safe * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have @@ -147,14 +147,14 @@ void AllreduceRobust::Broadcast(void *sendrecvbuf_, size_t total_size, int root) * \return the version number of check point loaded * if returned version == 0, this means no model has been CheckPointed * the p_model is not touched, user should do necessary initialization by themselves - * + * * Common usage example: * int iter = rabit::LoadCheckPoint(&model); * if (iter == 0) model.InitParameters(); * for (i = iter; i < max_iter; ++i) { * do many things, include allreduce * rabit::CheckPoint(model); - * } + * } * * \sa CheckPoint, VersionNumber */ @@ -208,7 +208,7 @@ int AllreduceRobust::LoadCheckPoint(Serializable *global_model, * \brief internal consistency check function, * use check to ensure user always call CheckPoint/LoadCheckPoint * with or without local but not both, this function will set the approperiate settings - * in the first call of LoadCheckPoint/CheckPoint + * in the first call of LoadCheckPoint/CheckPoint * * \param with_local whether the user calls CheckPoint with local model */ @@ -224,14 +224,14 @@ void AllreduceRobust::LocalModelCheck(bool with_local) { num_local_replica = 0; } } else { - utils::Check(use_local_model == int(with_local), + utils::Check(use_local_model == static_cast(with_local), "Can only call Checkpoint/LoadCheckPoint always with"\ "or without local_model, but not mixed case"); } } /*! * \brief internal implementation of checkpoint, support both lazy and normal way - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller need to gauranttees that global_model * is the same in all nodes @@ -423,7 +423,7 @@ AllreduceRobust::ReturnType AllreduceRobust::TryResetLinks(void) { * recover links according to the error type reported * if there is no error, return true * \param err_type the type of error happening in the system - * \return true if err_type is kSuccess, false otherwise + * \return true if err_type is kSuccess, false otherwise */ bool AllreduceRobust::CheckAndRecover(ReturnType err_type) { if (err_type == kSuccess) return true; @@ -488,7 +488,7 @@ ShortestDist(const std::pair &node_value, * \brief message passing function, used to decide the * data request from each edge, whether need to request data from certain edge * \param node_value a pair of request_data and best_link - * request_data stores whether current node need to request data + * request_data stores whether current node need to request data * best_link gives the best edge index to fetch the data * \param req_in the data request from incoming edges * \param out_index the edge index of output link @@ -524,7 +524,7 @@ inline char DataRequest(const std::pair &node_value, * * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details * \sa ReturnType - */ + */ AllreduceRobust::ReturnType AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role, size_t *p_size, @@ -586,7 +586,7 @@ AllreduceRobust::TryDecideRouting(AllreduceRobust::RecoverType role, * * \return this function can return kSuccess/kSockError/kGetExcept, see ReturnType for details * \sa ReturnType, TryDecideRouting - */ + */ AllreduceRobust::ReturnType AllreduceRobust::TryRecoverData(RecoverType role, void *sendrecvbuf_, @@ -644,7 +644,7 @@ 
AllreduceRobust::TryRecoverData(RecoverType role, if (role == kRequestData) { const int pid = recv_link; if (selecter.CheckRead(links[pid].sock)) { - ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size); + ReturnType ret = links[pid].ReadToArray(sendrecvbuf_, size); if (ret != kSuccess) { return ReportError(&links[pid], ret); } @@ -691,7 +691,7 @@ AllreduceRobust::TryRecoverData(RecoverType role, if (len != -1) { links[i].size_write += len; } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&links[i], ret); } } @@ -823,10 +823,10 @@ AllreduceRobust::TryGetResult(void *sendrecvbuf, size_t size, int seqno, bool re * \param buf the buffer to store the result * \param size the total size of the buffer * \param flag flag information about the action \sa ActionSummary - * \param seqno sequence number of the action, if it is special action with flag set, + * \param seqno sequence number of the action, if it is special action with flag set, * seqno needs to be set to ActionSummary::kSpecialOp * - * \return if this function can return true or false + * \return if this function can return true or false * - true means buf already set to the * result by recovering procedure, the action is complete, no further action is needed * - false means this is the lastest action that has not yet been executed, need to execute the action @@ -907,7 +907,7 @@ bool AllreduceRobust::RecoverExec(void *buf, size_t size, int flag, int seqno) { * plus replication of states in previous num_local_replica hops in the ring * * The input parameters must contain the valid local states available in current nodes, - * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt + * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt * If there is sufficient information in the ring, when the function returns, local_chkpt will * contain num_local_replica + 1 checkpoints (including the chkpt of this node) * If there is no sufficient information in the ring, this function the number of checkpoints @@ -1161,7 +1161,7 @@ AllreduceRobust::RingPassing(void *sendrecvbuf_, if (len != -1) { read_ptr += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&prev, ret); } } @@ -1171,7 +1171,7 @@ AllreduceRobust::RingPassing(void *sendrecvbuf_, if (len != -1) { write_ptr += static_cast(len); } else { - ReturnType ret = Errno2Return(errno); + ReturnType ret = Errno2Return(); if (ret != kSuccess) return ReportError(&prev, ret); } } diff --git a/subtree/rabit/src/allreduce_robust.h b/subtree/rabit/src/allreduce_robust.h index 658d6f8c70ef..caf2e57afcb7 100644 --- a/subtree/rabit/src/allreduce_robust.h +++ b/subtree/rabit/src/allreduce_robust.h @@ -5,7 +5,7 @@ * using TCP non-block socket and tree-shape reduction. * * This implementation considers the failure of nodes - * + * * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou */ #ifndef RABIT_ALLREDUCE_ROBUST_H_ @@ -28,13 +28,13 @@ class AllreduceRobust : public AllreduceBase { /*! \brief shutdown the engine */ virtual void Shutdown(void); /*! - * \brief set parameters to the engine + * \brief set parameters to the engine * \param name parameter name * \param val parameter value */ virtual void SetParam(const char *name, const char *val); /*! 
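// Illustrative sketch (not from the patch): the "common usage example" referred
// to in the comments above, extended with a per-node local model. When a local
// model is used, LoadCheckPoint/CheckPoint must always be called with both
// models (this is what LocalModelCheck() enforces). GlobalModel/LocalModel are
// hypothetical rabit::Serializable implementations.
inline void TrainWithLocalState(GlobalModel *global, LocalModel *local, int max_iter) {
  int iter = rabit::LoadCheckPoint(global, local);   // 0 => nothing checkpointed yet
  if (iter == 0) {
    global->InitParameters();                        // hypothetical initializers
    local->InitParameters();
  }
  for (int i = iter; i < max_iter; ++i) {
    // ... Allreduce/Broadcast calls that update *global, node-local work on *local ...
    rabit::CheckPoint(global, local);                // version number increases by one
  }
}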
- * \brief perform in-place allreduce, on sendrecvbuf + * \brief perform in-place allreduce, on sendrecvbuf * this function is NOT thread-safe * \param sendrecvbuf_ buffer for both sending and recving data * \param type_nbytes the unit number of bytes the type have @@ -69,14 +69,14 @@ class AllreduceRobust : public AllreduceBase { * \return the version number of check point loaded * if returned version == 0, this means no model has been CheckPointed * the p_model is not touched, user should do necessary initialization by themselves - * + * * Common usage example: * int iter = rabit::LoadCheckPoint(&model); * if (iter == 0) model.InitParameters(); * for (i = iter; i < max_iter; ++i) { * do many things, include allreduce * rabit::CheckPoint(model); - * } + * } * * \sa CheckPoint, VersionNumber */ @@ -85,7 +85,7 @@ class AllreduceRobust : public AllreduceBase { /*! * \brief checkpoint the model, meaning we finished a stage of execution * every time we call check point, there is a version number which will increase by one - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller need to gauranttees that global_model * is the same in all nodes @@ -105,16 +105,16 @@ class AllreduceRobust : public AllreduceBase { /*! * \brief This function can be used to replace CheckPoint for global_model only, * when certain condition is met(see detailed expplaination). - * + * * This is a "lazy" checkpoint such that only the pointer to global_model is * remembered and no memory copy is taken. To use this function, the user MUST ensure that: * The global_model must remain unchanged util last call of Allreduce/Broadcast in current version finishs. - * In another words, global_model model can be changed only between last call of + * In another words, global_model model can be changed only between last call of * Allreduce/Broadcast and LazyCheckPoint in current version - * + * * For example, suppose the calling sequence is: * LazyCheckPoint, code1, Allreduce, code2, Broadcast, code3, LazyCheckPoint - * + * * If user can only changes global_model in code3, then LazyCheckPoint can be used to * improve efficiency of the program. * \param global_model pointer to the globally shared model/state @@ -287,6 +287,7 @@ class AllreduceRobust : public AllreduceBase { if (seqno_.size() == 0) return -1; return seqno_.back(); } + private: // sequence number of each std::vector seqno_; @@ -301,14 +302,14 @@ class AllreduceRobust : public AllreduceBase { * \brief internal consistency check function, * use check to ensure user always call CheckPoint/LoadCheckPoint * with or without local but not both, this function will set the approperiate settings - * in the first call of LoadCheckPoint/CheckPoint + * in the first call of LoadCheckPoint/CheckPoint * * \param with_local whether the user calls CheckPoint with local model */ void LocalModelCheck(bool with_local); /*! 
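// Illustrative sketch (not from the patch): the LazyCheckPoint contract stated
// above, written out. Because only the pointer is remembered and no memory copy
// is made, the global model may be mutated only after the last
// Allreduce/Broadcast of the current version ("code3" in the sequence).
// MyModel (a rabit::Serializable) and ApplyUpdate are hypothetical.
inline void OneLazyRound(MyModel *model, float *grad, size_t n) {
  rabit::LazyCheckPoint(model);                  // register pointer, no memory copy
  rabit::Allreduce<rabit::op::Sum>(grad, n);     // *model must stay unchanged here
  ApplyUpdate(model, grad, n);                   // the only place *model may change
  rabit::LazyCheckPoint(model);                  // or rabit::CheckPoint(model)
}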
* \brief internal implementation of checkpoint, support both lazy and normal way - * + * * \param global_model pointer to the globally shared model/state * when calling this function, the caller need to gauranttees that global_model * is the same in all nodes @@ -326,10 +327,10 @@ class AllreduceRobust : public AllreduceBase { * after this function finishes, all the messages received and sent * before in all live links are discarded, * This allows us to get a fresh start after error has happened - * + * * TODO(tqchen): this function is not yet functioning was not used by engine, * simple resetlink and reconnect strategy is used - * + * * \return this function can return kSuccess or kSockError * when kSockError is returned, it simply means there are bad sockets in the links, * and some link recovery proceduer is needed @@ -340,7 +341,7 @@ class AllreduceRobust : public AllreduceBase { * recover links according to the error type reported * if there is no error, return true * \param err_type the type of error happening in the system - * \return true if err_type is kSuccess, false otherwise + * \return true if err_type is kSuccess, false otherwise */ bool CheckAndRecover(ReturnType err_type); /*! @@ -355,7 +356,7 @@ class AllreduceRobust : public AllreduceBase { * \param seqno sequence number of the action, if it is special action with flag set, * seqno needs to be set to ActionSummary::kSpecialOp * - * \return if this function can return true or false + * \return if this function can return true or false * - true means buf already set to the * result by recovering procedure, the action is complete, no further action is needed * - false means this is the lastest action that has not yet been executed, need to execute the action @@ -364,7 +365,7 @@ class AllreduceRobust : public AllreduceBase { int seqno = ActionSummary::kSpecialOp); /*! * \brief try to load check point - * + * * This is a collaborative function called by all nodes * only the nodes with requester set to true really needs to load the check point * other nodes acts as collaborative roles to complete this request @@ -395,7 +396,7 @@ class AllreduceRobust : public AllreduceBase { * \param p_size used to store the size of the message, for node in state kHaveData, * this size must be set correctly before calling the function * for others, this surves as output parameter - + * \param p_recvlink used to store the link current node should recv data from, if necessary * this can be -1, which means current node have the data * \param p_req_in used to store the resulting vector, indicating which link we should send the data to @@ -432,7 +433,7 @@ class AllreduceRobust : public AllreduceBase { * plus replication of states in previous num_local_replica hops in the ring * * The input parameters must contain the valid local states available in current nodes, - * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt + * This function try ist best to "complete" the missing parts of local_rptr and local_chkpt * If there is sufficient information in the ring, when the function returns, local_chkpt will * contain num_local_replica + 1 checkpoints (including the chkpt of this node) * If there is no sufficient information in the ring, this function the number of checkpoints @@ -487,7 +488,7 @@ o * the input state must exactly one saved state(local state of current node) LinkRecord *read_link, LinkRecord *write_link); /*! 
- * \brief run message passing algorithm on the allreduce tree + * \brief run message passing algorithm on the allreduce tree * the result is edge message stored in p_edge_in and p_edge_out * \param node_value the value associated with current node * \param p_edge_in used to store input message from each of the edge @@ -509,7 +510,7 @@ o * the input state must exactly one saved state(local state of current node) inline ReturnType MsgPassing(const NodeType &node_value, std::vector *p_edge_in, std::vector *p_edge_out, - EdgeType (*func) + EdgeType(*func) (const NodeType &node_value, const std::vector &edge_in, size_t out_index)); diff --git a/subtree/rabit/src/engine.cc b/subtree/rabit/src/engine.cc index c5041642ef1f..0f4770fe20e6 100644 --- a/subtree/rabit/src/engine.cc +++ b/subtree/rabit/src/engine.cc @@ -3,7 +3,7 @@ * \file engine.cc * \brief this file governs which implementation of engine we are actually using * provides an singleton of engine interface - * + * * \author Tianqi Chen, Ignacio Cano, Tianyi Zhou */ #define _CRT_SECURE_NO_WARNINGS @@ -60,7 +60,7 @@ void Allreduce_(void *sendrecvbuf, } // code for reduce handle -ReduceHandle::ReduceHandle(void) +ReduceHandle::ReduceHandle(void) : handle_(NULL), redfunc_(NULL), htype_(NULL) { } ReduceHandle::~ReduceHandle(void) {} diff --git a/subtree/rabit/src/engine_mpi.cc b/subtree/rabit/src/engine_mpi.cc index 5c8a4c3726ae..11e55335b80a 100644 --- a/subtree/rabit/src/engine_mpi.cc +++ b/subtree/rabit/src/engine_mpi.cc @@ -3,7 +3,7 @@ * \file engine_mpi.cc * \brief this file gives an implementation of engine interface using MPI, * this will allow rabit program to run with MPI, but do not comes with fault tolerant - * + * * \author Tianqi Chen */ #define _CRT_SECURE_NO_WARNINGS @@ -110,6 +110,8 @@ inline MPI::Datatype GetType(mpi::DataType dtype) { case kULong: return MPI::UNSIGNED_LONG; case kFloat: return MPI::FLOAT; case kDouble: return MPI::DOUBLE; + case kLongLong: return MPI::LONG_LONG; + case kULongLong: return MPI::UNSIGNED_LONG_LONG; } utils::Error("unknown mpi::DataType"); return MPI::CHAR; @@ -141,7 +143,7 @@ void Allreduce_(void *sendrecvbuf, } // code for reduce handle -ReduceHandle::ReduceHandle(void) +ReduceHandle::ReduceHandle(void) : handle_(NULL), redfunc_(NULL), htype_(NULL) { } ReduceHandle::~ReduceHandle(void) { @@ -164,7 +166,7 @@ void ReduceHandle::Init(IEngine::ReduceFunction redfunc, size_t type_nbytes) { if (type_nbytes != 0) { MPI::Datatype *dtype = new MPI::Datatype(); if (type_nbytes % 8 == 0) { - *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); + *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*) } else if (type_nbytes % 4 == 0) { *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int)); } else { @@ -193,7 +195,7 @@ void ReduceHandle::Allreduce(void *sendrecvbuf, dtype->Free(); } if (type_nbytes % 8 == 0) { - *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); + *dtype = MPI::LONG.Create_contiguous(type_nbytes / sizeof(long)); // NOLINT(*) } else if (type_nbytes % 4 == 0) { *dtype = MPI::INT.Create_contiguous(type_nbytes / sizeof(int)); } else { diff --git a/subtree/rabit/src/socket.h b/subtree/rabit/src/socket.h index c0eb6278cc75..6df7a7b7835f 100644 --- a/subtree/rabit/src/socket.h +++ b/subtree/rabit/src/socket.h @@ -51,7 +51,7 @@ struct SockAddr { utils::Check(gethostname(&buf[0], 256) != -1, "fail to get host name"); return std::string(buf.c_str()); } - /*! + /*! 
* \brief set the address * \param url the url of the address * \param port the port of address @@ -83,7 +83,7 @@ struct SockAddr { } }; -/*! +/*! * \brief base class containing common operations of TCP and UDP sockets */ class Socket { @@ -94,6 +94,25 @@ class Socket { inline operator SOCKET() const { return sockfd; } + /*! + * \return last error of socket operation + */ + inline static int GetLastError(void) { +#ifdef _WIN32 + return WSAGetLastError(); +#else + return errno; +#endif + } + /*! \return whether last error was would block */ + inline static bool LastErrorWouldBlock(void) { + int errsv = GetLastError(); +#ifdef _WIN32 + return errsv == WSAEWOULDBLOCK; +#else + return errsv == EAGAIN || errsv == EWOULDBLOCK; +#endif + } /*! * \brief start up the socket module * call this before using the sockets @@ -110,15 +129,15 @@ class Socket { } #endif } - /*! + /*! * \brief shutdown the socket module after use, all sockets need to be closed - */ + */ inline static void Finalize(void) { #ifdef _WIN32 WSACleanup(); #endif } - /*! + /*! * \brief set this socket to use non-blocking mode * \param non_block whether set it to be non-block, if it is false * it will set it back to block mode @@ -144,8 +163,8 @@ class Socket { } #endif } - /*! - * \brief bind the socket to an address + /*! + * \brief bind the socket to an address * \param addr */ inline void Bind(const SockAddr &addr) { @@ -154,7 +173,7 @@ class Socket { Socket::Error("Bind"); } } - /*! + /*! * \brief try bind the socket to host, from start_port to end_port * \param start_port starting port number to try * \param end_port ending port number to try @@ -169,11 +188,11 @@ class Socket { return port; } #if defined(_WIN32) - if (WSAGetLastError() != WSAEADDRINUSE) { - Socket::Error("TryBindHost"); - } + if (WSAGetLastError() != WSAEADDRINUSE) { + Socket::Error("TryBindHost"); + } #else - if (errno != EADDRINUSE) { + if (errno != EADDRINUSE) { Socket::Error("TryBindHost"); } #endif @@ -216,8 +235,12 @@ class Socket { } // report an socket error inline static void Error(const char *msg) { - int errsv = errno; + int errsv = GetLastError(); +#ifdef _WIN32 + utils::Error("Socket %s Error:WSAError-code=%d", msg, errsv); +#else utils::Error("Socket %s Error:%s", msg, strerror(errsv)); +#endif } protected: @@ -225,7 +248,7 @@ class Socket { } }; -/*! +/*! * \brief a wrapper of TCP socket that hopefully be cross platform */ class TCPSocket : public Socket{ @@ -238,10 +261,11 @@ class TCPSocket : public Socket{ /*! * \brief enable/disable TCP keepalive * \param keepalive whether to set the keep alive option on - */ + */ inline void SetKeepAlive(bool keepalive) { int opt = static_cast(keepalive); - if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, reinterpret_cast(&opt), sizeof(opt)) < 0) { + if (setsockopt(sockfd, SOL_SOCKET, SO_KEEPALIVE, + reinterpret_cast(&opt), sizeof(opt)) < 0) { Socket::Error("SetKeepAlive"); } } @@ -271,12 +295,12 @@ class TCPSocket : public Socket{ return TCPSocket(newfd); } /*! - * \brief decide whether the socket is at OOB mark + * \brief decide whether the socket is at OOB mark * \return 1 if at mark, 0 if not, -1 if an error occured */ inline int AtMark(void) const { #ifdef _WIN32 - unsigned long atmark; + unsigned long atmark; // NOLINT(*) if (ioctlsocket(sockfd, SIOCATMARK, &atmark) != NO_ERROR) return -1; #else int atmark; @@ -284,8 +308,8 @@ class TCPSocket : public Socket{ #endif return static_cast(atmark); } - /*! - * \brief connect to an address + /*! 
+ * \brief connect to an address * \param addr the address to connect to * \return whether connect is successful */ @@ -305,8 +329,8 @@ class TCPSocket : public Socket{ const char *buf = reinterpret_cast(buf_); return send(sockfd, buf, static_cast(len), flag); } - /*! - * \brief receive data using the socket + /*! + * \brief receive data using the socket * \param buf_ the pointer to the buffer * \param len the size of the buffer * \param flags extra flags @@ -330,7 +354,7 @@ class TCPSocket : public Socket{ while (ndone < len) { ssize_t ret = send(sockfd, buf, static_cast(len - ndone), 0); if (ret == -1) { - if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone; + if (LastErrorWouldBlock()) return ndone; Socket::Error("SendAll"); } buf += ret; @@ -352,7 +376,7 @@ class TCPSocket : public Socket{ ssize_t ret = recv(sockfd, buf, static_cast(len - ndone), MSG_WAITALL); if (ret == -1) { - if (errno == EAGAIN || errno == EWOULDBLOCK) return ndone; + if (LastErrorWouldBlock()) return ndone; Socket::Error("RecvAll"); } if (ret == 0) return ndone; @@ -362,7 +386,7 @@ class TCPSocket : public Socket{ return ndone; } /*! - * \brief send a string over network + * \brief send a string over network * \param str the string to be sent */ inline void SendStr(const std::string &str) { @@ -400,7 +424,7 @@ struct SelectHelper { maxfd = 0; } /*! - * \brief add file descriptor to watch for read + * \brief add file descriptor to watch for read * \param fd file descriptor to be watched */ inline void WatchRead(SOCKET fd) { @@ -450,7 +474,7 @@ struct SelectHelper { * \param timeout the timeout counter, can be 0, which means wait until the event happen * \return 1 if success, 0 if timeout, and -1 if error occurs */ - inline static int WaitExcept(SOCKET fd, long timeout = 0) { + inline static int WaitExcept(SOCKET fd, long timeout = 0) { // NOLINT(*) fd_set wait_set; FD_ZERO(&wait_set); FD_SET(fd, &wait_set); @@ -463,10 +487,10 @@ struct SelectHelper { * \param select_write whether to watch for write event * \param select_except whether to watch for exception event * \param timeout specify timeout in micro-seconds(ms) if equals 0, means select will always block - * \return number of active descriptors selected, + * \return number of active descriptors selected, * return -1 if error occurs */ - inline int Select(long timeout = 0) { + inline int Select(long timeout = 0) { // NOLINT(*) int ret = Select_(static_cast(maxfd + 1), &read_set, &write_set, &except_set, timeout); if (ret == -1) { @@ -477,7 +501,7 @@ struct SelectHelper { private: inline static int Select_(int maxfd, fd_set *rfds, - fd_set *wfds, fd_set *efds, long timeout) { + fd_set *wfds, fd_set *efds, long timeout) { // NOLINT(*) #if !defined(_WIN32) utils::Assert(maxfd < FD_SETSIZE, "maxdf must be smaller than FDSETSIZE"); #endif diff --git a/subtree/rabit/test/Makefile b/subtree/rabit/test/Makefile index a1ff6a854afd..62e4e17f0714 100644 --- a/subtree/rabit/test/Makefile +++ b/subtree/rabit/test/Makefile @@ -2,7 +2,7 @@ export CC = gcc export CXX = g++ export MPICXX = mpicxx export LDFLAGS= -L../lib -pthread -lm -lrt -export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++11 +export CFLAGS = -Wall -O3 -msse2 -Wno-unknown-pragmas -fPIC -I../include -std=c++0x # specify tensor path BIN = speed_test model_recover local_recover lazy_recover diff --git a/subtree/rabit/test/test.mk b/subtree/rabit/test/test.mk index be3429bab2ed..282a82bc4536 100644 --- a/subtree/rabit/test/test.mk +++ b/subtree/rabit/test/test.mk @@ -1,7 +1,7 @@ 
# this is a makefile used to show testcases of rabit .PHONY: all -all: +all: model_recover_10_10k model_recover_10_10k_die_same # this experiment test recovery with actually process exit, use keepalive to keep program alive model_recover_10_10k: diff --git a/subtree/rabit/tracker/rabit_tracker.py b/subtree/rabit/tracker/rabit_tracker.py index c8dd896f168b..d8e6ae84d0b2 100644 --- a/subtree/rabit/tracker/rabit_tracker.py +++ b/subtree/rabit/tracker/rabit_tracker.py @@ -1,6 +1,6 @@ """ Tracker script for rabit -Implements the tracker control protocol +Implements the tracker control protocol - start rabit jobs - help nodes to establish links with each other @@ -19,13 +19,13 @@ """ Extension of socket to handle recv and send of special data """ -class ExSocket: +class ExSocket: def __init__(self, sock): self.sock = sock def recvall(self, nbytes): res = [] sock = self.sock - nread = 0 + nread = 0 while nread < nbytes: chunk = self.sock.recv(min(nbytes - nread, 1024)) nread += len(chunk) @@ -106,7 +106,7 @@ def assign_rank(self, rank, wait_conn, tree_map, parent_map, ring_map): for r in conset: self.sock.sendstr(wait_conn[r].host) self.sock.sendint(wait_conn[r].port) - self.sock.sendint(r) + self.sock.sendint(r) nerr = self.sock.recvint() if nerr != 0: continue @@ -121,7 +121,7 @@ def assign_rank(self, rank, wait_conn, tree_map, parent_map, ring_map): wait_conn.pop(r, None) self.wait_accept = len(badset) - len(conset) return rmset - + class Tracker: def __init__(self, port = 9091, port_end = 9999, verbose = True, hostIP = 'auto'): sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) @@ -132,7 +132,7 @@ def __init__(self, port = 9091, port_end = 9999, verbose = True, hostIP = 'auto' break except socket.error: continue - sock.listen(16) + sock.listen(128) self.sock = sock self.verbose = verbose if hostIP == 'auto': @@ -145,7 +145,7 @@ def slave_envs(self): """ get enviroment variables for slaves can be passed in as args or envs - """ + """ if self.hostIP == 'dns': host = socket.gethostname() elif self.hostIP == 'ip': @@ -153,14 +153,14 @@ def slave_envs(self): else: host = self.hostIP return {'rabit_tracker_uri': host, - 'rabit_tracker_port': self.port} + 'rabit_tracker_port': self.port} def get_neighbor(self, rank, nslave): rank = rank + 1 ret = [] if rank > 1: ret.append(rank / 2 - 1) if rank * 2 - 1 < nslave: - ret.append(rank * 2 - 1) + ret.append(rank * 2 - 1) if rank * 2 < nslave: ret.append(rank * 2) return ret @@ -198,10 +198,10 @@ def get_ring(self, tree_map, parent_map): rlst = self.find_share_ring(tree_map, parent_map, 0) assert len(rlst) == len(tree_map) ring_map = {} - nslave = len(tree_map) + nslave = len(tree_map) for r in range(nslave): rprev = (r + nslave - 1) % nslave - rnext = (r + 1) % nslave + rnext = (r + 1) % nslave ring_map[rlst[r]] = (rlst[rprev], rlst[rnext]) return ring_map @@ -231,7 +231,7 @@ def get_link_map(self, nslave): else: parent_map_[rmap[k]] = -1 return tree_map_, parent_map_, ring_map_ - + def handle_print(self,slave, msg): sys.stdout.write(msg) @@ -253,14 +253,14 @@ def accept_slaves(self, nslave): pending = [] # lazy initialize tree_map tree_map = None - + while len(shutdown) != nslave: fd, s_addr = self.sock.accept() s = SlaveEntry(fd, s_addr) if s.cmd == 'print': msg = s.sock.recvstr() self.handle_print(s, msg) - continue + continue if s.cmd == 'shutdown': assert s.rank >= 0 and s.rank not in shutdown assert s.rank not in wait_conn @@ -280,12 +280,12 @@ def accept_slaves(self, nslave): assert s.world_size == -1 or s.world_size == nslave if s.cmd == 
'recover': assert s.rank >= 0 - + rank = s.decide_rank(job_map) # batch assignment of ranks if rank == -1: assert len(todo_nodes) != 0 - pending.append(s) + pending.append(s) if len(pending) == len(todo_nodes): pending.sort(key = lambda x : x.host) for s in pending: diff --git a/subtree/rabit/windows/basic/basic.vcxproj b/subtree/rabit/windows/basic/basic.vcxproj index 4e686584cc1e..109c405efda1 100644 --- a/subtree/rabit/windows/basic/basic.vcxproj +++ b/subtree/rabit/windows/basic/basic.vcxproj @@ -100,6 +100,7 @@ true true ..\..\include + MultiThreaded true diff --git a/subtree/rabit/wrapper/rabit.py b/subtree/rabit/wrapper/rabit.py index 6282e5cfd77e..91ce3e6ae62a 100644 --- a/subtree/rabit/wrapper/rabit.py +++ b/subtree/rabit/wrapper/rabit.py @@ -1,8 +1,9 @@ """ -Python interface for rabit - Reliable Allreduce and Broadcast Library +Reliable Allreduce and Broadcast Library. + Author: Tianqi Chen """ +# pylint: disable=unused-argument,invalid-name,global-statement,dangerous-default-value, import cPickle as pickle import ctypes import os @@ -10,34 +11,41 @@ import warnings import numpy as np +# version information about the doc +__version__ = '1.0' + if os.name == 'nt': WRAPPER_PATH = os.path.dirname(__file__) + '\\..\\windows\\x64\\Release\\rabit_wrapper%s.dll' else: WRAPPER_PATH = os.path.dirname(__file__) + '/librabit_wrapper%s.so' -rbtlib = None + +_LIB = None # load in xgboost library -def loadlib__(lib = 'standard'): - global rbtlib - if rbtlib != None: - warnings.Warn('rabit.int call was ignored because it has already been initialized', level = 2) +def _loadlib(lib='standard'): + """Load rabit library.""" + global _LIB + if _LIB != None: + warnings.warn('rabit.int call was ignored because it has'\ + ' already been initialized', level=2) return if lib == 'standard': - rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '') + _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '') elif lib == 'mock': - rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mock') + _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mock') elif lib == 'mpi': - rbtlib = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mpi') + _LIB = ctypes.cdll.LoadLibrary(WRAPPER_PATH % '_mpi') else: raise Exception('unknown rabit lib %s, can be standard, mock, mpi' % lib) - rbtlib.RabitGetRank.restype = ctypes.c_int - rbtlib.RabitGetWorldSize.restype = ctypes.c_int - rbtlib.RabitVersionNumber.restype = ctypes.c_int + _LIB.RabitGetRank.restype = ctypes.c_int + _LIB.RabitGetWorldSize.restype = ctypes.c_int + _LIB.RabitVersionNumber.restype = ctypes.c_int -def unloadlib__(): - global rbtlib - del rbtlib - rbtlib = None +def _unloadlib(): + """Unload rabit library.""" + global _LIB + del _LIB + _LIB = None # reduction operators MAX = 0 @@ -45,125 +53,118 @@ def unloadlib__(): SUM = 2 BITOR = 3 -def check_err__(): - """ - reserved function used to check error - """ - return +def init(args=None, lib='standard'): + """Intialize the rabit module, call this once before using anything. -def init(args = sys.argv, lib = 'standard'): + Parameters + ---------- + args: list of str, optional + The list of arguments used to initialized the rabit + usually you need to pass in sys.argv. + Defaults to sys.argv when it is None. 
+ lib: {'standard', 'mock', 'mpi'} + Type of library we want to load """ - intialize the rabit module, call this once before using anything - Arguments: - args: list(string) [default=sys.argv] - the list of arguments used to initialized the rabit - usually you need to pass in sys.argv - with_mock: boolean [default=False] - Whether initialize the mock test module - """ - loadlib__(lib) + if args is None: + args = sys.argv + _loadlib(lib) arr = (ctypes.c_char_p * len(args))() arr[:] = args - rbtlib.RabitInit(len(args), arr) - check_err__() + _LIB.RabitInit(len(args), arr) def finalize(): + """Finalize the rabit engine. + + Call this function after you finished all jobs. """ - finalize the rabit engine, call this function after you finished all jobs - """ - rbtlib.RabitFinalize() - check_err__() - unloadlib__() + _LIB.RabitFinalize() + _unloadlib() def get_rank(): + """Get rank of current process. + + Returns + ------- + rank : int + Rank of current process. """ - Returns rank of current process - """ - ret = rbtlib.RabitGetRank() - check_err__() + ret = _LIB.RabitGetRank() return ret def get_world_size(): + """Get total number workers. + + Returns + ------- + n : int + Total number of process. """ - Returns get total number of process - """ - ret = rbtlib.RabitGetWorldSize() - check_err__() + ret = _LIB.RabitGetWorldSize() return ret def tracker_print(msg): - """ - print message to the tracker - this function can be used to communicate the information of the progress - to the tracker + """Print message to the tracker. + + This function can be used to communicate the information of + the progress to the tracker + + Parameters + ---------- + msg : str + The message to be printed to tracker. """ if not isinstance(msg, str): msg = str(msg) - rbtlib.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8')) - check_err__() + _LIB.RabitTrackerPrint(ctypes.c_char_p(msg).encode('utf-8')) def get_processor_name(): - """ - Returns the name of processor(host) + """Get the processor name. + + Returns + ------- + name : str + the name of processor(host) """ mxlen = 256 length = ctypes.c_ulong() buf = ctypes.create_string_buffer(mxlen) - rbtlib.RabitGetProcessorName(buf, ctypes.byref(length), - mxlen) - check_err__() + _LIB.RabitGetProcessorName(buf, ctypes.byref(length), mxlen) return buf.value def broadcast(data, root): - """ - broadcast object from one node to all other nodes - this function will return the broadcasted object - - Example: the following example broadcast hello from rank 0 to all other nodes - ```python - rabit.init() - n = 3 - rank = rabit.get_rank() - s = None - if rank == 0: - s = {'hello world':100, 2:3} - print '@node[%d] before-broadcast: s=\"%s\"' % (rank, str(s)) - s = rabit.broadcast(s, 0) - print '@node[%d] after-broadcast: s=\"%s\"' % (rank, str(s)) - rabit.finalize() - ``` - - Arguments: - data: anytype that can be pickled - input data, if current rank does not equal root, this can be None - root: int - rank of the node to broadcast data from - Returns: - the result of broadcast + """Broadcast object from one node to all other nodes. + + Parameters + ---------- + data : any type that can be pickled + Input data, if current rank does not equal root, this can be None + root : int + Rank of the node to broadcast data from. + + Returns + ------- + object : int + the result of broadcast. 
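The rewritten docstring drops the inline usage example that appears in the removed lines of this hunk. A minimal sketch of the same flow against the module-level API is kept here for readers of the patch; it assumes the wrapper is importable as `rabit` and that the script is launched through the rabit tracker so that several ranks exist.

```python
import rabit

# broadcast a picklable object from rank 0 to every other worker
rabit.init()
rank = rabit.get_rank()
s = None
if rank == 0:
    s = {'hello world': 100, 2: 3}
print('@node[%d] before-broadcast: s="%s"' % (rank, str(s)))
s = rabit.broadcast(s, 0)
print('@node[%d] after-broadcast: s="%s"' % (rank, str(s)))
rabit.finalize()
```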
""" rank = get_rank() length = ctypes.c_ulong() if root == rank: assert data is not None, 'need to pass in data when broadcasting' - s = pickle.dumps(data, protocol = pickle.HIGHEST_PROTOCOL) + s = pickle.dumps(data, protocol=pickle.HIGHEST_PROTOCOL) length.value = len(s) # run first broadcast - rbtlib.RabitBroadcast(ctypes.byref(length), - ctypes.sizeof(ctypes.c_ulong), - root) - check_err__() + _LIB.RabitBroadcast(ctypes.byref(length), + ctypes.sizeof(ctypes.c_ulong), root) if root != rank: dptr = (ctypes.c_char * length.value)() # run second - rbtlib.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p), - length.value, root) - check_err__() + _LIB.RabitBroadcast(ctypes.cast(dptr, ctypes.c_void_p), + length.value, root) data = pickle.loads(dptr.raw) del dptr else: - rbtlib.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p), - length.value, root) - check_err__() + _LIB.RabitBroadcast(ctypes.cast(ctypes.c_char_p(s), ctypes.c_void_p), + length.value, root) del s return data @@ -179,20 +180,29 @@ def broadcast(data, root): np.dtype('float64') : 7 } -def allreduce(data, op, prepare_fun = None): - """ - perform allreduce, return the result, this function is not thread-safe - Arguments: - data: numpy ndarray - input data - op: int - reduction operators, can be MIN, MAX, SUM, BITOR - prepare_fun: lambda data - Lazy preprocessing function, if it is not None, prepare_fun(data) - will be called by the function before performing allreduce, to intialize the data - If the result of Allreduce can be recovered directly, then prepare_fun will NOT be called - Returns: - the result of allreduce, have same shape as data +def allreduce(data, op, prepare_fun=None): + """Perform allreduce, return the result. + + Parameters + ---------- + data: numpy array + Input data. + op: int + Reduction operators, can be MIN, MAX, SUM, BITOR + prepare_fun: function + Lazy preprocessing function, if it is not None, prepare_fun(data) + will be called by the function before performing allreduce, to intialize the data + If the result of Allreduce can be recovered directly, + then prepare_fun will NOT be called + + Returns + ------- + result : array_like + The result of allreduce, have same shape as data + + Notes + ----- + This function is not thread-safe. 
""" if not isinstance(data, np.ndarray): raise Exception('allreduce only takes in numpy.ndarray') @@ -202,21 +212,21 @@ def allreduce(data, op, prepare_fun = None): if buf.dtype not in DTYPE_ENUM__: raise Exception('data type %s not supported' % str(buf.dtype)) if prepare_fun is None: - rbtlib.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p), - buf.size, DTYPE_ENUM__[buf.dtype], - op, None, None) + _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p), + buf.size, DTYPE_ENUM__[buf.dtype], + op, None, None) else: - PFUNC = ctypes.CFUNCTYPE(None, ctypes.c_void_p) + func_ptr = ctypes.CFUNCTYPE(None, ctypes.c_void_p) def pfunc(args): + """prepare function.""" prepare_fun(data) - rbtlib.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p), - buf.size, DTYPE_ENUM__[buf.dtype], - op, PFUNC(pfunc), None) - check_err__() + _LIB.RabitAllreduce(buf.ctypes.data_as(ctypes.c_void_p), + buf.size, DTYPE_ENUM__[buf.dtype], + op, func_ptr(pfunc), None) return buf -def load_model__(ptr, length): +def _load_model(ptr, length): """ Internal function used by the module, unpickle a model from a buffer specified by ptr, length @@ -229,78 +239,89 @@ def load_model__(ptr, length): data = (ctypes.c_char * length).from_address(ctypes.addressof(ptr.contents)) return pickle.loads(data.raw) -def load_checkpoint(with_local = False): - """ - load latest check point - Arguments: - with_local: boolean [default = False] - whether the checkpoint contains local model - Returns: +def load_checkpoint(with_local=False): + """Load latest check point. + + Parameters + ---------- + with_local: bool, optional + whether the checkpoint contains local model + + Returns + ------- + tuple : tuple if with_local: return (version, gobal_model, local_model) else return (version, gobal_model) if returned version == 0, this means no model has been CheckPointed and global_model, local_model returned will be None """ - gp = ctypes.POINTER(ctypes.c_char)() + gptr = ctypes.POINTER(ctypes.c_char)() global_len = ctypes.c_ulong() if with_local: - lp = ctypes.POINTER(ctypes.c_char)() + lptr = ctypes.POINTER(ctypes.c_char)() local_len = ctypes.c_ulong() - version = rbtlib.RabitLoadCheckPoint( - ctypes.byref(gp), + version = _LIB.RabitLoadCheckPoint( + ctypes.byref(gptr), ctypes.byref(global_len), - ctypes.byref(lp), + ctypes.byref(lptr), ctypes.byref(local_len)) - check_err__() if version == 0: return (version, None, None) return (version, - load_model__(gp, global_len.value), - load_model__(lp, local_len.value)) + _load_model(gptr, global_len.value), + _load_model(lptr, local_len.value)) else: - version = rbtlib.RabitLoadCheckPoint( - ctypes.byref(gp), + version = _LIB.RabitLoadCheckPoint( + ctypes.byref(gptr), ctypes.byref(global_len), None, None) - check_err__() if version == 0: return (version, None) return (version, - load_model__(gp, global_len.value)) - -def checkpoint(global_model, local_model = None): - """ - checkpoint the model, meaning we finished a stage of execution - every time we call check point, there is a version number which will increase by one + _load_model(gptr, global_len.value)) - Arguments: - global_model: anytype that can be pickled - globally shared model/state when calling this function, - the caller need to gauranttees that global_model is the same in all nodes - local_model: anytype that can be pickled - local model, that is specific to current node/rank. - This can be None when no local state is needed. 
- local_model requires explicit replication of the model for fault-tolerance, - which will bring replication cost in checkpoint function, - while global_model do not need explicit replication. - It is recommended to use global_model if possible +def checkpoint(global_model, local_model=None): + """Checkpoint the model. + + This means we finished a stage of execution. + Every time we call check point, there is a version number which will increase by one. + + Parameters + ---------- + global_model: anytype that can be pickled + globally shared model/state when calling this function, + the caller need to gauranttees that global_model is the same in all nodes + + local_model: anytype that can be pickled + Local model, that is specific to current node/rank. + This can be None when no local state is needed. + + Notes + ----- + local_model requires explicit replication of the model for fault-tolerance. + This will bring replication cost in checkpoint function. + while global_model do not need explicit replication. + It is recommended to use global_model if possible. """ - sg = pickle.dumps(global_model) + sglobal = pickle.dumps(global_model) if local_model is None: - rbtlib.RabitCheckPoint(sg, len(sg), None, 0) - check_err__() - del sg; + _LIB.RabitCheckPoint(sglobal, len(sglobal), None, 0) + del sglobal else: - sl = pickle.dumps(local_model) - rbtlib.RabitCheckPoint(sg, len(sg), sl, len(sl)) - check_err__() - del sl; del sg; + slocal = pickle.dumps(local_model) + _LIB.RabitCheckPoint(sglobal, len(sglobal), slocal, len(slocal)) + del slocal + del sglobal def version_number(): + """Returns version number of current stored model. + + This means how many calls to CheckPoint we made so far. + + Returns + ------- + version : int + Version number of currently stored model """ - Returns version number of current stored model, - which means how many calls to CheckPoint we made so far - """ - ret = rbtlib.RabitVersionNumber() - check_err__() + ret = _LIB.RabitVersionNumber() return ret diff --git a/subtree/rabit/wrapper/rabit_wrapper.cc b/subtree/rabit/wrapper/rabit_wrapper.cc index 704bf4abc605..7025b3ffe57b 100644 --- a/subtree/rabit/wrapper/rabit_wrapper.cc +++ b/subtree/rabit/wrapper/rabit_wrapper.cc @@ -1,3 +1,4 @@ +// Copyright by Contributors // implementations in ctypes #define _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_DEPRECATE @@ -28,7 +29,7 @@ struct FHelper { void (*prepare_fun)(void *arg), void *prepare_arg) { utils::Error("DataType does not support bitwise or operation"); - } + } }; template inline void Allreduce_(void *sendrecvbuf_, @@ -60,12 +61,12 @@ inline void Allreduce_(void *sendrecvbuf_, return; case kLong: rabit::Allreduce - (static_cast(sendrecvbuf_), + (static_cast(sendrecvbuf_), // NOLINT(*) count, prepare_fun, prepare_arg); return; case kULong: rabit::Allreduce - (static_cast(sendrecvbuf_), + (static_cast(sendrecvbuf_), // NOLINT(*) count, prepare_fun, prepare_arg); return; case kFloat: @@ -135,7 +136,7 @@ struct ReadWrapper : public Serializable { } virtual void Save(Stream *fo) const { utils::Error("not implemented"); - } + } }; struct WriteWrapper : public Serializable { const char *data; @@ -179,7 +180,7 @@ extern "C" { if (s.length() > max_len) { s.resize(max_len - 1); } - strcpy(out_name, s.c_str()); + strcpy(out_name, s.c_str()); // NOLINT(*) *out_len = static_cast(s.length()); } void RabitBroadcast(void *sendrecv_data, @@ -218,7 +219,7 @@ extern "C" { *out_local_model = BeginPtr(local_buffer); *out_local_len = static_cast(local_buffer.length()); } - return version; 
+ return version; } void RabitCheckPoint(const char *global_model, rbt_ulong global_len, diff --git a/subtree/rabit/wrapper/rabit_wrapper.h b/subtree/rabit/wrapper/rabit_wrapper.h index 39caa70b490a..d00a31fda49c 100644 --- a/subtree/rabit/wrapper/rabit_wrapper.h +++ b/subtree/rabit/wrapper/rabit_wrapper.h @@ -1,18 +1,19 @@ -#ifndef RABIT_WRAPPER_H_ -#define RABIT_WRAPPER_H_ /*! + * Copyright by Contributors * \file rabit_wrapper.h * \author Tianqi Chen * \brief a C style wrapper of rabit * can be used to create wrapper of other languages */ +#ifndef RABIT_WRAPPER_H_ +#define RABIT_WRAPPER_H_ #ifdef _MSC_VER #define RABIT_DLL __declspec(dllexport) #else #define RABIT_DLL #endif // manually define unsign long -typedef unsigned long rbt_ulong; +typedef unsigned long rbt_ulong; // NOLINT(*) #ifdef __cplusplus extern "C" { @@ -23,8 +24,8 @@ extern "C" { * \param argv the array of input arguments */ RABIT_DLL void RabitInit(int argc, char *argv[]); - /*! - * \brief finalize the rabit engine, call this function after you finished all jobs + /*! + * \brief finalize the rabit engine, call this function after you finished all jobs */ RABIT_DLL void RabitFinalize(void); /*! \brief get rank of current process */ @@ -37,9 +38,9 @@ extern "C" { * the user who monitors the tracker * \param msg the message to be printed */ - RABIT_DLL void RabitTrackerPrint(const char *msg); + RABIT_DLL void RabitTrackerPrint(const char *msg); /*! - * \brief get name of processor + * \brief get name of processor * \param out_name hold output string * \param out_len hold length of output string * \param max_len maximum buffer length of input @@ -50,7 +51,7 @@ extern "C" { /*! * \brief broadcast an memory region to all others from root * - * Example: int a = 1; Broadcast(&a, sizeof(a), root); + * Example: int a = 1; Broadcast(&a, sizeof(a), root); * \param sendrecv_data the pointer to send or recive buffer, * \param size the size of the data * \param root the root of process @@ -58,7 +59,7 @@ extern "C" { RABIT_DLL void RabitBroadcast(void *sendrecv_data, rbt_ulong size, int root); /*! - * \brief perform in-place allreduce, on sendrecvbuf + * \brief perform in-place allreduce, on sendrecvbuf * this function is NOT thread-safe * * Example Usage: the following code gives sum of the result @@ -81,14 +82,14 @@ extern "C" { int enum_op, void (*prepare_fun)(void *arg), void *prepare_arg); - + /*! * \brief load latest check point * \param out_global_model hold output of serialized global_model * \param out_global_len the output length of serialized global model * \param out_local_model hold output of serialized local_model, can be NULL * \param out_local_len the output length of serialized local model, can be NULL - * + * * \return the version number of check point loaded * if returned version == 0, this means no model has been CheckPointed * nothing will be touched @@ -100,7 +101,7 @@ extern "C" { /*! 
* \brief checkpoint the model, meaning we finished a stage of execution * every time we call check point, there is a version number which will increase by one - * + * * \param global_model hold content of serialized global_model * \param global_len the content length of serialized global model * \param local_model hold content of serialized local_model, can be NULL @@ -122,4 +123,4 @@ extern "C" { #ifdef __cplusplus } // C #endif -#endif // XGBOOST_WRAPPER_H_ +#endif // RABIT_WRAPPER_H_ diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py index fa287b247d86..dcdfc62746c0 100644 --- a/tests/python/test_basic.py +++ b/tests/python/test_basic.py @@ -3,8 +3,11 @@ import xgboost as xgb import unittest +import matplotlib +matplotlib.use('Agg') dpath = 'demo/data/' +rng = np.random.RandomState(1994) class TestBasic(unittest.TestCase): @@ -45,7 +48,7 @@ def test_dmatrix_init(self): feature_names=['a', 'b', 'c', 'd', 'd']) # contains symbol self.assertRaises(ValueError, xgb.DMatrix, data, - feature_names=['a', 'b', 'c', 'd', 'e=1']) + feature_names=['a', 'b', 'c', 'd', 'e<1']) dm = xgb.DMatrix(data) dm.feature_names = list('abcde') @@ -102,7 +105,7 @@ def test_pandas(self): df = pd.DataFrame([[1, 2., True], [2, 3., False]], columns=['a', 'b', 'c']) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['a', 'b', 'c'] - assert dm.feature_types == ['int', 'q', 'i'] + assert dm.feature_types == ['int', 'float', 'i'] assert dm.num_row() == 2 assert dm.num_col() == 3 @@ -122,17 +125,62 @@ def test_pandas(self): df = pd.DataFrame([[1, 2., True], [2, 3., False]]) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['0', '1', '2'] - assert dm.feature_types == ['int', 'q', 'i'] + assert dm.feature_types == ['int', 'float', 'i'] assert dm.num_row() == 2 assert dm.num_col() == 3 df = pd.DataFrame([[1, 2., 1], [2, 3., 1]], columns=[4, 5, 6]) dm = xgb.DMatrix(df, label=pd.Series([1, 2])) assert dm.feature_names == ['4', '5', '6'] - assert dm.feature_types == ['int', 'q', 'int'] + assert dm.feature_types == ['int', 'float', 'int'] assert dm.num_row() == 2 assert dm.num_col() == 3 + df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + dummies = pd.get_dummies(df) + # B A_X A_Y A_Z + # 0 1 1 0 0 + # 1 2 0 1 0 + # 2 3 0 0 1 + result, _, _ = xgb.core._maybe_pandas_data(dummies, None, None) + exp = np.array([[ 1., 1., 0., 0.], + [ 2., 0., 1., 0.], + [ 3., 0., 0., 1.]]) + np.testing.assert_array_equal(result, exp) + + dm = xgb.DMatrix(dummies) + assert dm.feature_names == ['B', 'A_X', 'A_Y', 'A_Z'] + assert dm.feature_types == ['int', 'float', 'float', 'float'] + assert dm.num_row() == 3 + assert dm.num_col() == 4 + + df = pd.DataFrame({'A=1': [1, 2, 3], 'A=2': [4, 5, 6]}) + dm = xgb.DMatrix(df) + assert dm.feature_names == ['A=1', 'A=2'] + assert dm.feature_types == ['int', 'int'] + assert dm.num_row() == 3 + assert dm.num_col() == 2 + + def test_pandas_label(self): + import pandas as pd + + # label must be a single column + df = pd.DataFrame({'A': ['X', 'Y', 'Z'], 'B': [1, 2, 3]}) + self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) + + # label must be supported dtype + df = pd.DataFrame({'A': np.array(['a', 'b', 'c'], dtype=object)}) + self.assertRaises(ValueError, xgb.core._maybe_pandas_label, df) + + df = pd.DataFrame({'A': np.array([1, 2, 3], dtype=int)}) + result = xgb.core._maybe_pandas_label(df) + np.testing.assert_array_equal(result, np.array([[1.], [2.], [3.]], dtype=float)) + + dm = xgb.DMatrix(np.random.randn(3, 2), label=df) + 
assert dm.num_row() == 3 + assert dm.num_col() == 2 + + def test_load_file_invalid(self): self.assertRaises(ValueError, xgb.Booster, @@ -197,9 +245,6 @@ def test_plotting(self): bst2 = xgb.Booster(model_file='xgb.model') # plotting - import matplotlib - matplotlib.use('Agg') - from matplotlib.axes import Axes from graphviz import Digraph @@ -220,7 +265,6 @@ def test_plotting(self): for p in ax.patches: assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red - ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'], title=None, xlabel=None, ylabel=None) assert isinstance(ax, Axes) @@ -235,5 +279,63 @@ def test_plotting(self): g = xgb.to_graphviz(bst2, num_trees=0) assert isinstance(g, Digraph) + ax = xgb.plot_tree(bst2, num_trees=0) assert isinstance(ax, Axes) + + def test_importance_plot_lim(self): + np.random.seed(1) + dm = xgb.DMatrix(np.random.randn(100, 100), label=[0, 1]*50) + bst = xgb.train({}, dm) + assert len(bst.get_fscore()) == 71 + ax = xgb.plot_importance(bst) + assert ax.get_xlim() == (0., 11.) + assert ax.get_ylim() == (-1., 71.) + + ax = xgb.plot_importance(bst, xlim=(0, 5), ylim=(10, 71)) + assert ax.get_xlim() == (0., 5.) + assert ax.get_ylim() == (10., 71.) + + def test_sklearn_api(self): + from sklearn import datasets + from sklearn.cross_validation import train_test_split + + np.random.seed(1) + + iris = datasets.load_iris() + tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120) + + classifier = xgb.XGBClassifier() + classifier.fit(tr_d, tr_l) + + preds = classifier.predict(te_d) + labels = te_l + err = sum([1 for p, l in zip(preds, labels) if p != l]) / len(te_l) + # error must be smaller than 10% + assert err < 0.1 + + def test_sklearn_plotting(self): + from sklearn import datasets + iris = datasets.load_iris() + + classifier = xgb.XGBClassifier() + classifier.fit(iris.data, iris.target) + + import matplotlib + matplotlib.use('Agg') + + from matplotlib.axes import Axes + from graphviz import Digraph + + ax = xgb.plot_importance(classifier) + assert isinstance(ax, Axes) + assert ax.get_title() == 'Feature importance' + assert ax.get_xlabel() == 'F score' + assert ax.get_ylabel() == 'Features' + assert len(ax.patches) == 4 + + g = xgb.to_graphviz(classifier, num_trees=0) + assert isinstance(g, Digraph) + + ax = xgb.plot_tree(classifier, num_trees=0) + assert isinstance(ax, Axes) diff --git a/tests/python/test_early_stopping.py b/tests/python/test_early_stopping.py new file mode 100644 index 000000000000..512fd20d0504 --- /dev/null +++ b/tests/python/test_early_stopping.py @@ -0,0 +1,62 @@ +import xgboost as xgb +import numpy as np +from sklearn.datasets import load_digits +from sklearn.cross_validation import KFold, train_test_split +from sklearn.metrics import mean_squared_error +import unittest + +rng = np.random.RandomState(1994) + + +class TestEarlyStopping(unittest.TestCase): + def test_early_stopping_nonparallel(self): + digits = load_digits(2) + X = digits['data'] + y = digits['target'] + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + clf1 = xgb.XGBClassifier() + clf1.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc", + eval_set=[(X_test, y_test)]) + clf2 = xgb.XGBClassifier() + clf2.fit(X_train, y_train, early_stopping_rounds=4, eval_metric="auc", + eval_set=[(X_test, y_test)]) + # should be the same + assert clf1.best_score == clf2.best_score + assert clf1.best_score != 1 + # check overfit + clf3 = xgb.XGBClassifier() + clf3.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc", 
+ eval_set=[(X_test, y_test)]) + assert clf3.best_score == 1 + + # TODO: parallel test for early stopping + # TODO: comment out for now. Will re-visit later + + def evalerror(self, preds, dtrain): + labels = dtrain.get_label() + return 'rmse', mean_squared_error(labels, preds) + + def test_cv_early_stopping(self): + digits = load_digits(2) + X = digits['data'] + y = digits['target'] + dm = xgb.DMatrix(X, label=y) + params = {'max_depth': 2, 'eta': 1, 'silent': 1, 'objective': 'binary:logistic'} + + import pandas as pd + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=10) + assert cv.shape[0] == 10 + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=5) + assert cv.shape[0] == 3 + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, early_stopping_rounds=1) + assert cv.shape[0] == 1 + + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, feval=self.evalerror, + early_stopping_rounds=10) + assert cv.shape[0] == 10 + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, feval=self.evalerror, + early_stopping_rounds=1) + assert cv.shape[0] == 5 + cv = xgb.cv(params, dm, num_boost_round=10, nfold=10, feval=self.evalerror, + maximize=True, early_stopping_rounds=1) + assert cv.shape[0] == 1 diff --git a/tests/python/test_eval_metrics.py b/tests/python/test_eval_metrics.py new file mode 100644 index 000000000000..190851dae5e9 --- /dev/null +++ b/tests/python/test_eval_metrics.py @@ -0,0 +1,95 @@ +import xgboost as xgb +import numpy as np +from sklearn.cross_validation import KFold, train_test_split +from sklearn.metrics import mean_squared_error +from sklearn.grid_search import GridSearchCV +from sklearn.datasets import load_iris, load_digits, load_boston +import unittest + +rng = np.random.RandomState(1337) + + +class TestEvalMetrics(unittest.TestCase): + xgb_params_01 = { + 'silent': 1, + 'nthread': 1, + 'eval_metric': 'error' + } + + xgb_params_02 = { + 'silent': 1, + 'nthread': 1, + 'eval_metric': ['error'] + } + + xgb_params_03 = { + 'silent': 1, + 'nthread': 1, + 'eval_metric': ['rmse', 'error'] + } + + xgb_params_04 = { + 'silent': 1, + 'nthread': 1, + 'eval_metric': ['error', 'rmse'] + } + + def evalerror_01(self, preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + + def evalerror_02(self, preds, dtrain): + labels = dtrain.get_label() + return [('error', float(sum(labels != (preds > 0.0))) / len(labels))] + + def evalerror_03(self, preds, dtrain): + labels = dtrain.get_label() + return [('rmse', mean_squared_error(labels, preds)), + ('error', float(sum(labels != (preds > 0.0))) / len(labels))] + + def evalerror_04(self, preds, dtrain): + labels = dtrain.get_label() + return [('error', float(sum(labels != (preds > 0.0))) / len(labels)), + ('rmse', mean_squared_error(labels, preds))] + + def test_eval_metrics(self): + digits = load_digits(2) + X = digits['data'] + y = digits['target'] + + Xt, Xv, yt, yv = train_test_split(X, y, test_size=0.2, random_state=0) + + dtrain = xgb.DMatrix(Xt, label=yt) + dvalid = xgb.DMatrix(Xv, label=yv) + + watchlist = [(dtrain, 'train'), (dvalid, 'val')] + + gbdt_01 = xgb.train(self.xgb_params_01, dtrain, num_boost_round=10) + gbdt_02 = xgb.train(self.xgb_params_02, dtrain, num_boost_round=10) + gbdt_03 = xgb.train(self.xgb_params_03, dtrain, num_boost_round=10) + assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0] + assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0] + + gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 
10, watchlist, + early_stopping_rounds=2) + gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist, + early_stopping_rounds=2) + gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist, + early_stopping_rounds=2) + gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist, + early_stopping_rounds=2) + assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0] + assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0] + assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0] + + gbdt_01 = xgb.train(self.xgb_params_01, dtrain, 10, watchlist, + early_stopping_rounds=2, feval=self.evalerror_01) + gbdt_02 = xgb.train(self.xgb_params_02, dtrain, 10, watchlist, + early_stopping_rounds=2, feval=self.evalerror_02) + gbdt_03 = xgb.train(self.xgb_params_03, dtrain, 10, watchlist, + early_stopping_rounds=2, feval=self.evalerror_03) + gbdt_04 = xgb.train(self.xgb_params_04, dtrain, 10, watchlist, + early_stopping_rounds=2, feval=self.evalerror_04) + assert gbdt_01.predict(dvalid)[0] == gbdt_02.predict(dvalid)[0] + assert gbdt_01.predict(dvalid)[0] == gbdt_03.predict(dvalid)[0] + assert gbdt_03.predict(dvalid)[0] != gbdt_04.predict(dvalid)[0] diff --git a/tests/python/test_models.py b/tests/python/test_models.py index 8c06d9de9528..883a605bea50 100644 --- a/tests/python/test_models.py +++ b/tests/python/test_models.py @@ -1,39 +1,89 @@ import numpy as np import xgboost as xgb +import unittest dpath = 'demo/data/' dtrain = xgb.DMatrix(dpath + 'agaricus.txt.train') dtest = xgb.DMatrix(dpath + 'agaricus.txt.test') -def test_glm(): - param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 } - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 4 - bst = xgb.train(param, dtrain, num_round, watchlist) - assert isinstance(bst, xgb.core.Booster) - preds = bst.predict(dtest) - labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 - -def test_custom_objective(): - param = {'max_depth':2, 'eta':1, 'silent':1 } - watchlist = [(dtest,'eval'), (dtrain,'train')] - num_round = 2 - def logregobj(preds, dtrain): - labels = dtrain.get_label() - preds = 1.0 / (1.0 + np.exp(-preds)) - grad = preds - labels - hess = preds * (1.0-preds) - return grad, hess - def evalerror(preds, dtrain): - labels = dtrain.get_label() - return 'error', float(sum(labels != (preds > 0.0))) / len(labels) - bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) - assert isinstance(bst, xgb.core.Booster) - preds = bst.predict(dtest) - labels = dtest.get_label() - err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) - assert err < 0.1 +rng = np.random.RandomState(1994) +class TestModels(unittest.TestCase): + def test_glm(self): + param = {'silent':1, 'objective':'binary:logistic', 'booster':'gblinear', 'alpha': 0.0001, 'lambda': 1 } + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 4 + bst = xgb.train(param, dtrain, num_round, watchlist) + assert isinstance(bst, xgb.core.Booster) + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.1 + + def test_eta_decay(self): + param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic' } + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 2 + # learning_rates as a list + bst = xgb.train(param, dtrain, num_round, watchlist, 
learning_rates=[0.4, 0.3]) + assert isinstance(bst, xgb.core.Booster) + + # learning_rates as a customized decay function + def eta_decay(ithround, num_boost_round): + return num_boost_round / ithround + bst = xgb.train(param, dtrain, num_round, watchlist, learning_rates=eta_decay) + assert isinstance(bst, xgb.core.Booster) + + + def test_custom_objective(self): + param = {'max_depth':2, 'eta':1, 'silent':1 } + watchlist = [(dtest,'eval'), (dtrain,'train')] + num_round = 2 + def logregobj(preds, dtrain): + labels = dtrain.get_label() + preds = 1.0 / (1.0 + np.exp(-preds)) + grad = preds - labels + hess = preds * (1.0-preds) + return grad, hess + def evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels != (preds > 0.0))) / len(labels) + + # test custom_objective in training + bst = xgb.train(param, dtrain, num_round, watchlist, logregobj, evalerror) + assert isinstance(bst, xgb.core.Booster) + preds = bst.predict(dtest) + labels = dtest.get_label() + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.1 + + # test custom_objective in cross-validation + xgb.cv(param, dtrain, num_round, nfold = 5, seed = 0, + obj = logregobj, feval=evalerror) + + # test maximize parameter + def neg_evalerror(preds, dtrain): + labels = dtrain.get_label() + return 'error', float(sum(labels == (preds > 0.0))) / len(labels) + bst2 = xgb.train(param, dtrain, num_round, watchlist, logregobj, neg_evalerror, maximize=True) + preds2 = bst2.predict(dtest) + err2 = sum(1 for i in range(len(preds2)) if int(preds2[i]>0.5)!=labels[i]) / float(len(preds2)) + assert err == err2 + + def test_fpreproc(self): + param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'} + num_round = 2 + def fpreproc(dtrain, dtest, param): + label = dtrain.get_label() + ratio = float(np.sum(label == 0)) / np.sum(label==1) + param['scale_pos_weight'] = ratio + return (dtrain, dtest, param) + xgb.cv(param, dtrain, num_round, nfold=5, + metrics={'auc'}, seed = 0, fpreproc = fpreproc) + + def test_show_stdv(self): + param = {'max_depth':2, 'eta':1, 'silent':1, 'objective':'binary:logistic'} + num_round = 2 + xgb.cv(param, dtrain, num_round, nfold=5, + metrics={'error'}, seed = 0, show_stdv = False) diff --git a/tests/python/test_training_continuation.py b/tests/python/test_training_continuation.py new file mode 100644 index 000000000000..ac6deca264f1 --- /dev/null +++ b/tests/python/test_training_continuation.py @@ -0,0 +1,92 @@ +import xgboost as xgb +import numpy as np +from sklearn.preprocessing import MultiLabelBinarizer +from sklearn.cross_validation import KFold, train_test_split +from sklearn.metrics import mean_squared_error +from sklearn.grid_search import GridSearchCV +from sklearn.datasets import load_iris, load_digits, load_boston +import unittest + +rng = np.random.RandomState(1337) + + +class TestTrainingContinuation(unittest.TestCase): + num_parallel_tree = 3 + + xgb_params_01 = { + 'silent': 1, + 'nthread': 1, + } + + xgb_params_02 = { + 'silent': 1, + 'nthread': 1, + 'num_parallel_tree': num_parallel_tree + } + + xgb_params_03 = { + 'silent': 1, + 'nthread': 1, + 'num_class': 5, + 'num_parallel_tree': num_parallel_tree + } + + def test_training_continuation(self): + digits_2class = load_digits(2) + digits_5class = load_digits(5) + + X_2class = digits_2class['data'] + y_2class = digits_2class['target'] + + X_5class = digits_5class['data'] + y_5class = digits_5class['target'] + + dtrain_2class = xgb.DMatrix(X_2class, 
label=y_2class) + dtrain_5class = xgb.DMatrix(X_5class, label=y_5class) + + gbdt_01 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10) + ntrees_01 = len(gbdt_01.get_dump()) + assert ntrees_01 == 10 + + gbdt_02 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=0) + gbdt_02.save_model('xgb_tc.model') + + gbdt_02a = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model=gbdt_02) + gbdt_02b = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=10, xgb_model="xgb_tc.model") + ntrees_02a = len(gbdt_02a.get_dump()) + ntrees_02b = len(gbdt_02b.get_dump()) + assert ntrees_02a == 10 + assert ntrees_02b == 10 + assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \ + mean_squared_error(y_2class, gbdt_02a.predict(dtrain_2class)) + assert mean_squared_error(y_2class, gbdt_01.predict(dtrain_2class)) == \ + mean_squared_error(y_2class, gbdt_02b.predict(dtrain_2class)) + + gbdt_03 = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=3) + gbdt_03.save_model('xgb_tc.model') + + gbdt_03a = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model=gbdt_03) + gbdt_03b = xgb.train(self.xgb_params_01, dtrain_2class, num_boost_round=7, xgb_model="xgb_tc.model") + ntrees_03a = len(gbdt_03a.get_dump()) + ntrees_03b = len(gbdt_03b.get_dump()) + assert ntrees_03a == 10 + assert ntrees_03b == 10 + assert mean_squared_error(y_2class, gbdt_03a.predict(dtrain_2class)) == \ + mean_squared_error(y_2class, gbdt_03b.predict(dtrain_2class)) + + gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=3) + assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree + assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \ + mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + + gbdt_04 = xgb.train(self.xgb_params_02, dtrain_2class, num_boost_round=7, xgb_model=gbdt_04) + assert gbdt_04.best_ntree_limit == (gbdt_04.best_iteration + 1) * self.num_parallel_tree + assert mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class)) == \ + mean_squared_error(y_2class, gbdt_04.predict(dtrain_2class, ntree_limit=gbdt_04.best_ntree_limit)) + + gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=7) + assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree + gbdt_05 = xgb.train(self.xgb_params_03, dtrain_5class, num_boost_round=3, xgb_model=gbdt_05) + assert gbdt_05.best_ntree_limit == (gbdt_05.best_iteration + 1) * self.num_parallel_tree + assert np.any(gbdt_05.predict(dtrain_5class) != + gbdt_05.predict(dtrain_5class, ntree_limit=gbdt_05.best_ntree_limit)) == False diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py new file mode 100644 index 000000000000..3e31ddb65c7d --- /dev/null +++ b/tests/python/test_with_sklearn.py @@ -0,0 +1,64 @@ +import xgboost as xgb +import numpy as np +from sklearn.cross_validation import KFold, train_test_split +from sklearn.metrics import mean_squared_error +from sklearn.grid_search import GridSearchCV +from sklearn.datasets import load_iris, load_digits, load_boston + +rng = np.random.RandomState(1994) + +def test_binary_classification(): + digits = load_digits(2) + y = digits['target'] + X = digits['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + preds = 
xgb_model.predict(X[test_index]) + labels = y[test_index] + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.1 + +def test_multiclass_classification(): + iris = load_iris() + y = iris['target'] + X = iris['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBClassifier().fit(X[train_index],y[train_index]) + preds = xgb_model.predict(X[test_index]) + # test other params in XGBClassifier().fit + preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) + preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) + preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3) + labels = y[test_index] + err = sum(1 for i in range(len(preds)) if int(preds[i]>0.5)!=labels[i]) / float(len(preds)) + assert err < 0.4 + +def test_boston_housing_regression(): + boston = load_boston() + y = boston['target'] + X = boston['data'] + kf = KFold(y.shape[0], n_folds=2, shuffle=True, random_state=rng) + for train_index, test_index in kf: + xgb_model = xgb.XGBRegressor().fit(X[train_index],y[train_index]) + preds = xgb_model.predict(X[test_index]) + # test other params in XGBRegressor().fit + preds2 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=3) + preds3 = xgb_model.predict(X[test_index], output_margin=True, ntree_limit=0) + preds4 = xgb_model.predict(X[test_index], output_margin=False, ntree_limit=3) + labels = y[test_index] + assert mean_squared_error(preds, labels) < 25 + +def test_parameter_tuning(): + boston = load_boston() + y = boston['target'] + X = boston['data'] + xgb_model = xgb.XGBRegressor() + clf = GridSearchCV(xgb_model, + {'max_depth': [2,4,6], + 'n_estimators': [50,100,200]}, verbose=1) + clf.fit(X,y) + assert clf.best_score_ < 0.7 + assert clf.best_params_ == {'n_estimators': 100, 'max_depth': 4} +
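The new test files above exercise the headline additions to the Python package: early stopping in both the sklearn wrapper and `xgb.cv`, per-round learning rates via `learning_rates`, training continuation via `xgb_model`, and the `best_ntree_limit` attribute on the Booster. A small sketch of the sklearn-wrapper early-stopping flow that those tests rely on, with an arbitrary dataset and split:

```python
import xgboost as xgb
from sklearn.datasets import load_digits
from sklearn.cross_validation import train_test_split

digits = load_digits(2)
X_train, X_test, y_train, y_test = train_test_split(
    digits['data'], digits['target'], random_state=0)

clf = xgb.XGBClassifier()
# stop once 'auc' on the held-out eval_set has not improved for 5 rounds
clf.fit(X_train, y_train, early_stopping_rounds=5, eval_metric="auc",
        eval_set=[(X_test, y_test)])
print('best score %f at iteration %d' % (clf.best_score, clf.best_iteration))
```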