diff --git a/.gitignore b/.gitignore
index 73ae6748eff1..048803abd7de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,10 +48,9 @@ Debug
 *.cpage.col
 *.cpage
 *.Rproj
-xgboost
-xgboost.mpi
-xgboost.mock
-train*
+./xgboost
+./xgboost.mpi
+./xgboost.mock
 rabit
 #.Rbuildignore
 R-package.Rproj
diff --git a/.travis.yml b/.travis.yml
index ac4f58154f29..5e10c3360e37 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,10 @@
 sudo: true
 
+# Enabling test on Linux and OS X
+os:
+  - linux
+  - osx
+
 # Use Build Matrix to do lint and build seperately
 env:
   matrix:
@@ -7,10 +12,15 @@ env:
     - TASK=lint LINT_LANG=python
     - TASK=R-package CXX=g++
     - TASK=python-package CXX=g++
+    - TASK=python-package3 CXX=g++
     - TASK=java-package CXX=g++
     - TASK=build CXX=g++
     - TASK=build-with-dmlc CXX=g++
 
+os:
+  - linux
+  - osx
+
 # dependent apt packages
 addons:
   apt:
@@ -20,19 +30,18 @@ addons:
       - wget
       - libcurl4-openssl-dev
       - unzip
-      - python-numpy
-      - python-scipy
-      - python-nose
 
 before_install:
+  - scripts/travis_osx_install.sh
   - git clone https://github.com/dmlc/dmlc-core
   - export TRAVIS=dmlc-core/scripts/travis/
-  - export PYTHONPATH=${PYTHONPATH}:${PWD}/wrapper
+  - export PYTHONPATH=${PYTHONPATH}:${PWD}/python-package
   - source ${TRAVIS}/travis_setup_env.sh
 
 install:
   - pip install cpplint pylint --user `whoami`
 
+
 script: scripts/travis_script.sh
 
 
diff --git a/CHANGES.md b/CHANGES.md
index 90fd77ebb7f8..a8ddcd7ea577 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -1,18 +1,18 @@
 Change Log
-=====
+==========
 
 xgboost-0.1
-=====
+===========
 * Initial release
 
 xgboost-0.2x
-=====
+============
 * Python module
 * Weighted samples instances
 * Initial version of pairwise rank
 
 xgboost-0.3
-=====
+===========
 * Faster tree construction module
   - Allows subsample columns during tree construction via ```bst:col_samplebytree=ratio```
 * Support for boosting from initial predictions
@@ -22,7 +22,7 @@ xgboost-0.3
 * Add R module
 
 xgboost-0.4
-=====
+===========
 * Distributed version of xgboost that runs on YARN, scales to billions of examples
 * Direct save/load data and model from/to S3 and HDFS
 * Feature importance visualization in R module, by Michael Benesty
@@ -34,3 +34,14 @@ xgboost-0.4
   - xgboost python model is now pickable
 * sklearn wrapper is supported in python module
 * Experimental External memory version
+
+on going at master
+==================
+* Fix List
+  - Fixed possible problem of poisson regression for R.
+* Python module now throws an exception instead of crashing the terminal when a parameter error happens.
+* Python module now has importance plot and tree plot functions.
+* Java api is ready for use
+* Added more test cases and continuous integration to make each build more robust
+* Improvements in sklearn compatible module
+* Added pip installation functionality for python module
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
new file mode 100644
index 000000000000..6ae79f795aee
--- /dev/null
+++ b/CONTRIBUTORS.md
@@ -0,0 +1,50 @@
+Contributors of DMLC/XGBoost
+============================
+XGBoost has been developed and used by an active community. Everyone is more than welcome to contribute; it is a great way to make the project better and more accessible to more users.
+
+Committers
+----------
+Committers are people who have made substantial contributions to the project and have been granted write access to it.
+* [Tianqi Chen](https://github.com/tqchen), University of Washington
+  - Tianqi is a PhD working on large-scale machine learning, he is the creator of the project.
+* [Tong He](https://github.com/hetong007), Simon Fraser University
+  - Tong is a master student working on data mining, he is the maintainer of xgboost R package.
+* [Bing Xu](https://github.com/antinucleon)
+  - Bing is the original creator of xgboost python package and currently the maintainer of [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl).
+* [Michael Benesty](https://github.com/pommedeterresautee)
+  - Michael is a lawyer, data scientist in France, he is the creator of xgboost interactive analysis module in R.
+
+Become a Committer
+------------------
+XGBoost is an open-source project and we are actively looking for new committers who are willing to help maintain and lead the project.
+Committers come from contributors who:
+* Have made substantial contributions to the project.
+* Are willing to spend time on maintaining and leading the project.
+
+New committers will be proposed by current committer members, with support from more than two of the current committers.
+
+List of Contributors
+--------------------
+* [Full List of Contributors](https://github.com/dmlc/xgboost/graphs/contributors)
+  - To contributors: please add your name to the list when you submit a patch to the project:)
+* [Kailong Chen](https://github.com/kalenhaha)
+  - Kailong is an early contributor of xgboost, he is creator of ranking objectives in xgboost.
+* [Skipper Seabold](https://github.com/jseabold)
+  - Skipper is the major contributor to the scikit-learn module of xgboost.
+* [Zygmunt Zając](https://github.com/zygmuntz)
+  - Zygmunt is the master behind the early stopping feature frequently used by kagglers.
+* [Ajinkya Kale](https://github.com/ajkl)
+* [Boliang Chen](https://github.com/cblsjtu)
+* [Vadim Khotilovich](https://github.com/khotilov)
+* [Yangqing Men](https://github.com/yanqingmen)
+  - Yangqing is the creator of xgboost java package.
+* [Engpeng Yao](https://github.com/yepyao)
+* [Giulio](https://github.com/giuliohome)
+  - Giulio is the creator of windows project of xgboost
+* [Jamie Hall](https://github.com/nerdcha)
+  - Jamie is the initial creator of xgboost sklearn module.
+* [Yen-Ying Lee](https://github.com/white1033)
+* [Masaaki Horikoshi](https://github.com/sinhrks)
+  - Masaaki is the initial creator of xgboost python plotting module.
+* [Hongliang Liu](https://github.com/phunterlau)
+  - Hongliang is the maintainer of xgboost python PyPI package for pip installation.
diff --git a/Makefile b/Makefile
index a24bea327534..c790f6b726ee 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
 export CC  = gcc
+#build on the fly
 export CXX = g++
 export MPICXX = mpicxx
 export LDFLAGS= -pthread -lm
@@ -11,6 +12,12 @@ ifeq ($(OS), Windows_NT)
 	export CC = gcc -m64
 endif
 
+UNAME= $(shell uname)
+
+ifeq ($(UNAME), Linux)
+	LDFLAGS += -lrt
+endif
+
 ifeq ($(no_omp),1)
 	CFLAGS += -DDISABLE_OPENMP
 else
@@ -161,9 +168,33 @@ Rcheck:
 	make Rbuild
 	R CMD check --as-cran xgboost*.tar.gz
 
+pythonpack:
+	#make clean
+	cd subtree/rabit;make clean;cd ..
+	rm -rf xgboost-deploy xgboost*.tar.gz
+	cp -r python-package xgboost-deploy
+	cp *.md xgboost-deploy/
+	cp LICENSE xgboost-deploy/
+	cp Makefile xgboost-deploy/xgboost
+	cp -r wrapper xgboost-deploy/xgboost
+	cp -r subtree xgboost-deploy/xgboost
+	cp -r multi-node xgboost-deploy/xgboost
+	cp -r windows xgboost-deploy/xgboost
+	cp -r src xgboost-deploy/xgboost
+
+	#make python
+
+pythonbuild:
+	make pythonpack
+	python setup.py install
+
+pythoncheck:
+	make pythonbuild
+	python -c 'import xgboost;print xgboost.core.find_lib_path()'
+
 # lint requires dmlc to be in current folder
 lint:
-	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package
+	dmlc-core/scripts/lint.py xgboost $(LINT_LANG) src wrapper R-package python-package
 
 clean:
 	$(RM) -rf $(OBJ) $(BIN) $(MPIBIN) $(MPIOBJ) $(SLIB) *.o  */*.o */*/*.o *~ */*~ */*/*~
diff --git a/R-package/.Rbuildignore b/R-package/.Rbuildignore
index 6b3c4084e9f4..b37d627ba487 100644
--- a/R-package/.Rbuildignore
+++ b/R-package/.Rbuildignore
@@ -3,3 +3,4 @@
 \.dll$
 ^.*\.Rproj$
 ^\.Rproj\.user$
+README.md
diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION
index c6975af5e75a..19410d65a44a 100644
--- a/R-package/DESCRIPTION
+++ b/R-package/DESCRIPTION
@@ -1,16 +1,16 @@
 Package: xgboost
 Type: Package
-Title: eXtreme Gradient Boosting
-Version: 0.4-0
-Date: 2015-05-11
+Title: Extreme Gradient Boosting
+Version: 0.4-2
+Date: 2015-08-01
 Author: Tianqi Chen <tianqi.tchen@gmail.com>, Tong He <hetong007@gmail.com>, Michael Benesty <michael@benesty.fr>
 Maintainer: Tong He <hetong007@gmail.com>
-Description: Xgboost is short for eXtreme Gradient Boosting, which is an 
-    efficient and scalable implementation of gradient boosting framework. 
-    This package is an R wrapper of xgboost. The package includes efficient 
+Description: Extreme Gradient Boosting, which is an 
+    efficient implementation of gradient boosting framework. 
+    This package is its R interface. The package includes efficient 
     linear model solver and tree learning algorithms. The package can automatically 
-    do parallel computation with OpenMP, and it can be more than 10 times faster
-    than existing gradient boosting packages such as gbm. It supports various
+    do parallel computation on a single machine which could be more than 10 times faster
+    than existing gradient boosting packages. It supports various
     objective functions, including regression, classification and ranking. The
     package is made to be extensible, so that users are also allowed to define
     their own objectives easily.
diff --git a/R-package/R/utils.R b/R-package/R/utils.R
index f7f6b919221a..e58601df8b61 100644
--- a/R-package/R/utils.R
+++ b/R-package/R/utils.R
@@ -288,7 +288,7 @@ xgb.cv.aggcv <- function(res, showsd = TRUE) {
     }
     ret <- paste(ret, sprintf("%f", mean(stats)), sep="")
     if (showsd) {
-      ret <- paste(ret, sprintf("+%f", sd(stats)), sep="")
+      ret <- paste(ret, sprintf("+%f", stats::sd(stats)), sep="")
     }
   }
   return (ret)
@@ -313,7 +313,7 @@ xgb.createFolds <- function(y, k = 10)
     if(cuts < 2) cuts <- 2
     if(cuts > 5) cuts <- 5
     y <- cut(y,
-             unique(quantile(y, probs = seq(0, 1, length = cuts))),
+             unique(stats::quantile(y, probs = seq(0, 1, length = cuts))),
              include.lowest = TRUE)
   }
 
diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R
index 793d904cd195..a5364db52b8d 100644
--- a/R-package/R/xgb.cv.R
+++ b/R-package/R/xgb.cv.R
@@ -240,7 +240,7 @@ xgb.cv <- function(params=list(), data, nrounds, nfold, label = NULL, missing =
     else colnames <- colnamesMean
     
     type <- rep(x = "numeric", times = length(colnames))
-    dt <- read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
+    dt <- utils::read.table(text = "", colClasses = type, col.names = colnames) %>% as.data.table
     split <- str_split(string = history, pattern = "\t")
     
     for(line in split) dt <- line[2:length(line)] %>% str_extract_all(pattern = "\\d*\\.+\\d*") %>% unlist %>% as.numeric %>% as.list %>% {rbindlist(list(dt, .), use.names = F, fill = F)}
diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R
index 7eea3dfcd62d..d083566a56bd 100644
--- a/R-package/R/xgb.model.dt.tree.R
+++ b/R-package/R/xgb.model.dt.tree.R
@@ -133,34 +133,33 @@ xgb.model.dt.tree <- function(feature_names = NULL, filename_dump = NULL, model
     allTrees <- rbindlist(list(allTrees, dt), use.names = T, fill = F)
   }
   
-  yes <- allTrees[!is.na(Yes),Yes]
-                                                                                      
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  yes <- allTrees[!is.na(Yes), Yes]
+  
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
       j = "Yes.Feature", 
-      value = allTrees[ID == yes,Feature])
-
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+      value = allTrees[ID %in% yes, Feature])
+  
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "Yes.Cover", 
-      value = allTrees[ID == yes,Cover])
-
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
-    j = "Yes.Quality", 
-    value = allTrees[ID == yes,Quality])
+      value = allTrees[ID %in% yes, Cover])
   
-  no <- allTrees[!is.na(No),No]
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
+      j = "Yes.Quality", 
+      value = allTrees[ID %in% yes, Quality])
+  no <- allTrees[!is.na(No), No]
   
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "No.Feature", 
-      value = allTrees[ID == no,Feature])
+      value = allTrees[ID %in% no, Feature])
   
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"),
       j = "No.Cover", 
-      value = allTrees[ID == no,Cover])
+      value = allTrees[ID %in% no, Cover])
   
-  set(allTrees, i = which(allTrees[,Feature]!= "Leaf"), 
+  set(allTrees, i = which(allTrees[, Feature] != "Leaf"), 
       j = "No.Quality", 
-      value = allTrees[ID == no,Quality])
-        
+      value = allTrees[ID %in% no, Quality])
+  
   allTrees
 }
 
diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R
index eb0f8e346c34..f126dfe464ae 100644
--- a/R-package/R/xgb.plot.importance.R
+++ b/R-package/R/xgb.plot.importance.R
@@ -33,7 +33,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
   if (!"data.table" %in% class(importance_matrix))  {     
     stop("importance_matrix: Should be a data.table.")
   }
-  if (!require(ggplot2, quietly = TRUE)) {
+  if (!requireNamespace("ggplot2", quietly = TRUE)) {
     stop("ggplot2 package is required for plotting the importance", call. = FALSE)
   }
   if (!requireNamespace("Ckmeans.1d.dp", quietly = TRUE)) {
@@ -46,7 +46,7 @@ xgb.plot.importance <- function(importance_matrix = NULL, numberOfClusters = c(1
   clusters <- suppressWarnings(Ckmeans.1d.dp::Ckmeans.1d.dp(importance_matrix[,Gain], numberOfClusters))
   importance_matrix[,"Cluster":=clusters$cluster %>% as.character]
     
-  plot <- ggplot(importance_matrix, aes(x=reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+  geom_bar(aes(fill=Cluster), stat="identity", position="identity") + coord_flip() + xlab("Features") + ylab("Gain") + ggtitle("Feature importance") + theme(plot.title = element_text(lineheight=.9, face="bold"), panel.grid.major.y = element_blank() )
+  plot <- ggplot2::ggplot(importance_matrix, ggplot2::aes(x=stats::reorder(Feature, Gain), y = Gain, width= 0.05), environment = environment())+  ggplot2::geom_bar(ggplot2::aes(fill=Cluster), stat="identity", position="identity") + ggplot2::coord_flip() + ggplot2::xlab("Features") + ggplot2::ylab("Gain") + ggplot2::ggtitle("Feature importance") + ggplot2::theme(plot.title = ggplot2::element_text(lineheight=.9, face="bold"), panel.grid.major.y = ggplot2::element_blank() )
   
   return(plot)  
 }
diff --git a/R-package/README.md b/R-package/README.md
index e974e3554ac0..c92bc9b96229 100644
--- a/R-package/README.md
+++ b/R-package/README.md
@@ -1,15 +1,44 @@
-# R package for xgboost.
+R package for xgboost
+=====================
 
-## Installation
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
+[![CRAN Downloads](http://cranlogs.r-pkg.org/badges/xgboost)](http://cran.rstudio.com/web/packages/xgboost/index.html)
 
-For up-to-date version (which is recommended), please install from github. Windows user will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
+Installation
+------------
+
+We are [on CRAN](https://cran.r-project.org/web/packages/xgboost/index.html) now. For the stable, pre-compiled (for Windows and OS X) version, please install from CRAN:
 
 ```r
-devtools::install_github('dmlc/xgboost',subdir='R-package')
+install.packages('xgboost')
 ```
 
+For the up-to-date version, please install from GitHub. Windows users will need to install [RTools](http://cran.r-project.org/bin/windows/Rtools/) first.
 
-## Examples
+```r
+devtools::install_github('dmlc/xgboost',subdir='R-package')
+```
+
+Examples
+--------
 
 * Please visit [walk through example](demo).
 * See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.R) on this dataset and the one related to [Otto challenge](../demo/kaggle-otto), including a [RMarkdown documentation](../demo/kaggle-otto/understandingXGBoostModel.Rmd).
+
+Notes
+-----
+
+If you face an issue installing the package using ```devtools::install_github```, something like this (even after updating libxml and RCurl, as a lot of forums suggest) -
+
+```
+devtools::install_github('dmlc/xgboost',subdir='R-package')
+Downloading github repo dmlc/xgboost@master
+Error in function (type, msg, asError = TRUE)  :
+  Peer certificate cannot be authenticated with given CA certificates
+```
+To get around this you can build the package locally as mentioned [here](https://github.com/dmlc/xgboost/issues/347) -
+```
+1. Clone the current repository and set your workspace to xgboost/R-package/
+2. Run R CMD INSTALL --build . in terminal to get the tarball.
+3. Run install.packages('path_to_the_tarball',repo=NULL) in R to install.
+```
diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R
index 762a1c8e8386..532c5d873280 100644
--- a/R-package/demo/basic_walkthrough.R
+++ b/R-package/demo/basic_walkthrough.R
@@ -1,7 +1,7 @@
 require(xgboost)
 require(methods)
 # we load in the agaricus dataset
-# In this example, we are aiming to predict whether a mushroom can be eated
+# In this example, we are aiming to predict whether a mushroom can be eaten
 data(agaricus.train, package='xgboost')
 data(agaricus.test, package='xgboost')
 train <- agaricus.train
@@ -12,8 +12,8 @@ class(train$data)
 
 #-------------Basic Training using XGBoost-----------------
 # this is the basic usage of xgboost you can put matrix in data field
-# note: we are puting in sparse matrix here, xgboost naturally handles sparse input
-# use sparse matrix when your feature is sparse(e.g. when you using one-hot encoding vector)
+# note: we are putting in sparse matrix here, xgboost naturally handles sparse input
+# use sparse matrix when your feature is sparse(e.g. when you are using one-hot encoding vector)
 print("training xgboost with sparseMatrix")
 bst <- xgboost(data = train$data, label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
@@ -22,7 +22,7 @@ print("training xgboost with Matrix")
 bst <- xgboost(data = as.matrix(train$data), label = train$label, max.depth = 2, eta = 1, nround = 2,
                nthread = 2, objective = "binary:logistic")
 
-# you can also put in xgb.DMatrix object, stores label, data and other meta datas needed for advanced features
+# you can also put in xgb.DMatrix object, which stores label, data and other metadata needed for advanced features
 print("training xgboost with xgb.DMatrix")
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nround = 2, nthread = 2, 
@@ -72,7 +72,7 @@ print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
 dtrain <- xgb.DMatrix(data = train$data, label=train$label)
 dtest <- xgb.DMatrix(data = test$data, label=test$label)
 #---------------Using watchlist----------------
-# watchlist is a list of xgb.DMatrix, each of them tagged with name
+# watchlist is a list of xgb.DMatrix, each of them is tagged with name
 watchlist <- list(train=dtrain, test=dtest)
 # to train with watchlist, use xgb.train, which contains more advanced features
 # watchlist allows us to monitor the evaluation result on all data in the list 
diff --git a/R-package/demo/create_sparse_matrix.R b/R-package/demo/create_sparse_matrix.R
index e3a536cfe7d2..2fbf41772029 100644
--- a/R-package/demo/create_sparse_matrix.R
+++ b/R-package/demo/create_sparse_matrix.R
@@ -1,11 +1,13 @@
 require(xgboost)
 require(Matrix)
 require(data.table)
-if (!require(vcd)) install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
-
+if (!require(vcd)) {
+  install.packages('vcd') #Available in Cran. Used for its dataset with categorical values.
+  require(vcd)
+}
 # According to its documentation, Xgboost works only on numbers.
 # Sometimes the dataset we have to work on have categorical data. 
-# A categorical variable is one which have a fixed number of values. By exemple, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
+# A categorical variable is one which has a fixed number of values. For example, if for each observation a variable called "Colour" can have only "red", "blue" or "green" as value, it is a categorical variable.
 #
 # In R, categorical variable is called Factor. 
 # Type ?factor in console for more information.
@@ -72,11 +74,11 @@ importance <- xgb.importance(sparse_matrix@Dimnames[[2]], 'xgb.model.dump')
 print(importance)
 # According to the matrix below, the most important feature in this dataset to predict if the treatment will work is the Age. The second most important feature is having received a placebo or not. The sex is third. Then we see our generated features (AgeDiscret). We can see that their contribution is very low (Gain column).
 
-# Does these results make sense?
+# Do these results make sense?
 # Let's check some Chi2 between each of these features and the outcome.
 
 print(chisq.test(df$Age, df$Y))
-# Pearson correlation between Age and illness disapearing is 35
+# Pearson correlation between Age and illness disappearing is 35
 
 print(chisq.test(df$AgeDiscret, df$Y))
 # Our first simplification of Age gives a Pearson correlation of 8.
@@ -84,6 +86,6 @@ print(chisq.test(df$AgeDiscret, df$Y))
 print(chisq.test(df$AgeCat, df$Y))
 # The perfectly random split I did between young and old at 30 years old have a low correlation of 2. It's a result we may expect as may be in my mind > 30 years is being old (I am 32 and starting feeling old, this may explain that), but  for the illness we are studying, the age to be vulnerable is not the same. Don't let your "gut" lower the quality of your model. In "data science", there is science :-)
 
-# As you can see, in general destroying information by simplying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not enough complex to show that. Check Kaggle forum for some challenging datasets.
+# As you can see, in general destroying information by simplifying it won't improve your model. Chi2 just demonstrates that. But in more complex cases, creating a new feature based on existing one which makes link with the outcome more obvious may help the algorithm and improve the model. The case studied here is not complex enough to show that. Check Kaggle forum for some challenging datasets.
 # However it's almost always worse when you add some arbitrary rules.
 # Moreover, you can notice that even if we have added some not useful new features highly correlated with other features, the boosting tree algorithm have been able to choose the best one, which in this case is the Age. Linear model may not be that strong in these scenario.
diff --git a/R-package/demo/custom_objective.R b/R-package/demo/custom_objective.R
index 201f23d98d20..7234ead869a3 100644
--- a/R-package/demo/custom_objective.R
+++ b/R-package/demo/custom_objective.R
@@ -33,7 +33,7 @@ evalerror <- function(preds, dtrain) {
   return(list(metric = "error", value = err))
 }
 
-param <- list(max.depth=2,eta=1,nthread = 2, silent=1, 
+param <- list(max.depth=2, eta=1, nthread = 2, silent=1, 
               objective=logregobj, eval_metric=evalerror)
 print ('start training with user customized objective')
 # training with customized objective, we can also do step by step training
@@ -57,9 +57,9 @@ logregobjattr <- function(preds, dtrain) {
   hess <- preds * (1 - preds)
   return(list(grad = grad, hess = hess))
 }
-
+param <- list(max.depth=2, eta=1, nthread = 2, silent=1, 
+              objective=logregobjattr, eval_metric=evalerror)
 print ('start training with user customized objective, with additional attributes in DMatrix')
 # training with customized objective, we can also do step by step training
 # simply look at xgboost.py's implementation of train
-bst <- xgb.train(param, dtrain, num_round, watchlist, 
-                 objective=logregobj, eval_metric=evalerror)
+bst <- xgb.train(param, dtrain, num_round, watchlist)
diff --git a/R-package/src/xgboost_R.cpp b/R-package/src/xgboost_R.cpp
index a8084b20675a..1d426c496a0f 100644
--- a/R-package/src/xgboost_R.cpp
+++ b/R-package/src/xgboost_R.cpp
@@ -32,14 +32,14 @@ extern "C" {
 bool CheckNAN(double v) {
   return ISNAN(v);
 }
-bool LogGamma(double v) {
+double LogGamma(double v) {
   return lgammafn(v);
 }
 }  // namespace utils
 
 namespace random {
 void Seed(unsigned seed) {
-  warning("parameter seed is ignored, please set random seed using set.seed");
+  //  warning("parameter seed is ignored, please set random seed using set.seed");
 }
 double Uniform(void) {
   return unif_rand();
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd
index b7648340d8a2..89d27fb45dc2 100644
--- a/R-package/vignettes/xgboostPresentation.Rmd
+++ b/R-package/vignettes/xgboostPresentation.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "Xgboost presentation"
-output: 
+output:
   rmarkdown::html_vignette:
     css: vignette.css
     number_sections: yes
@@ -16,7 +16,7 @@ vignette: >
 Introduction
 ============
 
-**Xgboost** is short for e**X**treme **G**radient **Boost**ing package. 
+**Xgboost** is short for e**X**treme **G**radient **Boost**ing package.
 
 The purpose of this Vignette is to show you how to use **Xgboost** to build a model and make predictions.
 
@@ -25,9 +25,9 @@ It is an efficient and scalable implementation of gradient boosting framework by
 - *linear* model ;
 - *tree learning* algorithm.
 
-It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily. 
+It supports various objective functions, including *regression*, *classification* and *ranking*. The package is made to be extendible, so that users are also allowed to define their own objective functions easily.
 
-It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions. 
+It has been [used](https://github.com/dmlc/xgboost) to win several [Kaggle](http://www.kaggle.com) competitions.
 
 It has several features:
 
@@ -64,7 +64,7 @@ Formerly available versions can be obtained from the CRAN [archive](http://cran.
 Learning
 ========
 
-For the purpose of this tutorial we will load **Xgboost** package.
+For the purpose of this tutorial we will load **XGBoost** package.
 
 ```{r libLoading, results='hold', message=F, warning=F}
 require(xgboost)
@@ -73,7 +73,7 @@ require(xgboost)
 Dataset presentation
 --------------------
 
-In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the the same as you will use on in your every day life :-). 
+In this example, we are aiming to predict whether a mushroom can be eaten or not (like in many tutorials, example data are the same as you will use in your everyday life :-).
 
 Mushroom data is cited from UCI Machine Learning Repository. @Bache+Lichman:2013.
 
@@ -85,7 +85,7 @@ We will load the `agaricus` datasets embedded with the package and will link the
 The datasets are already split in:
 
 * `train`: will be used to build the model ;
-* `test`: will be used to assess the quality of our model. 
+* `test`: will be used to assess the quality of our model.
 
 Why *split* the dataset in two parts?
 
@@ -115,7 +115,7 @@ dim(train$data)
 dim(test$data)
 ```
 
-This dataset is very small to not make the **R** package too heavy, however **Xgboost** is built to manage huge dataset very efficiently.
+This dataset is very small to not make the **R** package too heavy, however **XGBoost** is built to manage huge dataset very efficiently.
 
 As seen below, the `data` are stored in a `dgCMatrix` which is a *sparse* matrix and `label` vector is a `numeric` vector (`{0,1}`):
 
@@ -124,7 +124,7 @@ class(train$data)[1]
 class(train$label)
 ```
 
-Basic Training using Xgboost
+Basic Training using XGBoost
 ----------------------------
 
 This step is the most critical part of the process for the quality of our model.
@@ -160,7 +160,7 @@ bstDense <- xgboost(data = as.matrix(train$data), label = train$label, max.depth
 
 #### xgb.DMatrix
 
-**Xgboost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be usefull for the most advanced features we will discover later.
+**XGBoost** offers a way to group them in a `xgb.DMatrix`. You can even add other meta data in it. It will be useful for the most advanced features we will discover later.
 
 ```{r trainingDmatrix, message=F, warning=F}
 dtrain <- xgb.DMatrix(data = train$data, label = train$label)
@@ -169,7 +169,7 @@ bstDMatrix <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround
 
 #### Verbose option
 
-**Xgboost** has severa features to help you to view how the learning progress internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
+**XGBoost** has several features to help you view how the learning progresses internally. The purpose is to help you to set the best parameters, which is the key of your model quality.
 
 One of the simplest way to see the training progress is to set the `verbose` option (see below for more advanced technics).
 
@@ -188,7 +188,7 @@ bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, o
 bst <- xgboost(data = dtrain, max.depth = 2, eta = 1, nthread = 2, nround = 2, objective = "binary:logistic", verbose = 2)
 ```
 
-Basic prediction using Xgboost
+Basic prediction using XGBoost
 ==============================
 
 Perform the prediction
@@ -211,7 +211,7 @@ These numbers doesn't look like *binary classification* `{0,1}`. We need to perf
 Transform the regression in a binary classification
 ---------------------------------------------------
 
-The only thing that **Xgboost** does is a *regression*. **Xgboost** is using `label` vector to build its *regression* model.
+The only thing that **XGBoost** does is a *regression*. **XGBoost** is using `label` vector to build its *regression* model.
 
 How can we use a *regression* model to perform a binary classification?
 
@@ -240,7 +240,7 @@ Steps explanation:
 2. `probabilityVectorPreviouslyComputed != test$label` computes the vector of error between true data and computed probabilities ;
 3. `mean(vectorOfErrors)` computes the *average error* itself.
 
-The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**. 
+The most important thing to remember is that **to do a classification, you just do a regression to the** `label` **and then apply a threshold**.
 
 *Multiclass* classification works in a similar way.
 
@@ -269,7 +269,7 @@ Both `xgboost` (simple) and `xgb.train` (advanced) functions train models.
 
 One of the special feature of `xgb.train` is the capacity to follow the progress of the learning after each round. Because of the way boosting works, there is a time when having too many rounds lead to an overfitting. You can see this feature as a cousin of cross-validation method. The following technics will help you to avoid overfitting or optimizing the learning time in stopping it as soon as possible.
 
-One way to measure progress in learning of a model is to provide to **Xgboost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
+One way to measure progress in learning of a model is to provide to **XGBoost** a second dataset already classified. Therefore it can learn on the first dataset and test its model on the second one. Some metrics are measured after each round during the learning.
 
 > in some way it is similar to what we have done above with the average error. The main difference is that below it was after building the model, and now it is during the construction that we measure errors.
 
@@ -281,7 +281,7 @@ watchlist <- list(train=dtrain, test=dtest)
 bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchlist=watchlist, objective = "binary:logistic")
 ```
 
-**Xgboost** has computed at each round the same average error metric than seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset. 
+**XGBoost** has computed at each round the same average error metric as seen above (we set `nround` to 2, that is why we have two lines). Obviously, the `train-error` number is related to the training dataset (the one the algorithm learns from) and the `test-error` number to the test dataset.
 
 Both training and test error related metrics are very similar, and in some way, it makes sense: what we have learned from the training dataset matches the observations from the test dataset.
 
@@ -298,13 +298,13 @@ bst <- xgb.train(data=dtrain, max.depth=2, eta=1, nthread = 2, nround=2, watchli
 Linear boosting
 ---------------
 
-Until know, all the learnings we have performed were based on boosting trees. **Xgboost** implements a second algorithm, based on linear boosting. The only difference with previous command is `booster = "gblinear"` parameter (and removing `eta` parameter).
+Until now, all the learnings we have performed were based on boosting trees. **XGBoost** implements a second algorithm, based on linear boosting. The only difference from the previous command is the `booster = "gblinear"` parameter (and the removal of the `eta` parameter).
 
 ```{r linearBoosting, message=F, warning=F}
 bst <- xgb.train(data=dtrain, booster = "gblinear", max.depth=2, nthread = 2, nround=2, watchlist=watchlist, eval.metric = "error", eval.metric = "logloss", objective = "binary:logistic")
 ```
 
-In this specific case, *linear boosting* gets sligtly better performance metrics than decision trees based algorithm. 
+In this specific case, *linear boosting* gets slightly better performance metrics than the decision-tree-based algorithm.
 
 In simple cases, it will happem because there is nothing better than a linear algorithm to catch a linear link. However, decision trees are much better to catch a non linear link between predictors and outcome. Because there is no silver bullet, we advise you to check both algorithms with your own datasets to have an idea of what to use.
 
@@ -337,6 +337,17 @@ err <- as.numeric(sum(as.integer(pred > 0.5) != label))/length(label)
 print(paste("test-error=", err))
 ```
 
+View feature importance/influence from the learnt model
+-------------------------------------------------------
+
+Feature importance is similar to R gbm package's relative influence (rel.inf).
+
+```
+importance_matrix <- xgb.importance(model = bst)
+print(importance_matrix)
+xgb.plot.importance(importance_matrix)
+```
+
 View the trees from a model
 ---------------------------
 
@@ -346,6 +357,12 @@ You can dump the tree you learned using `xgb.dump` into a text file.
 xgb.dump(bst, with.stats = T)
 ```
 
+You can plot the trees from your model using `xgb.plot.tree`:
+
+```
+xgb.plot.tree(model = bst)
+```
+
 > if you provide a path to `fname` parameter you can save the trees to your hard drive.
 
 Save and load models
@@ -353,7 +370,7 @@ Save and load models
 
 May be your dataset is big, and it takes time to train a model on it? May be you are not a big fan of loosing time in redoing the same task again and again? In these very rare cases, you will want to save your model and load it when required.
 
-Hopefully for you, **Xgboost** implements such functions.
+Hopefully for you, **XGBoost** implements such functions.
 
 ```{r saveModel, message=F, warning=F}
 # save model to binary local file
@@ -380,7 +397,7 @@ file.remove("./xgboost.model")
 
 > result is `0`? We are good!
 
-In some very specific cases, like when you want to pilot **Xgboost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
+In some very specific cases, like when you want to pilot **XGBoost** from `caret` package, you will want to save the model as a *R* binary vector. See below how to do it.
 
 ```{r saveLoadRBinVectorModel, message=F, warning=F}
 # save model to R's raw vector
@@ -395,9 +412,9 @@ pred3 <- predict(bst3, test$data)
 
 # pred2 should be identical to pred
 print(paste("sum(abs(pred3-pred))=", sum(abs(pred2-pred))))
-``` 
+```
 
-> Again `0`? It seems that `Xgboost` works pretty well!
+> Again `0`? It seems that `XGBoost` works pretty well!
 
 References
 ==========
diff --git a/README.md b/README.md
index cdd4c02f7304..121462d3851c 100644
--- a/README.md
+++ b/README.md
@@ -1,61 +1,78 @@
-XGBoost: eXtreme Gradient Boosting
-==================================
-
+<img src=https://raw.githubusercontent.com/dmlc/dmlc.github.io/master/img/logo-m/xgboost.png width=135/>  eXtreme Gradient Boosting
+===========
 [![Build Status](https://travis-ci.org/dmlc/xgboost.svg?branch=master)](https://travis-ci.org/dmlc/xgboost)
+[![Documentation Status](https://readthedocs.org/projects/xgboost/badge/?version=latest)](https://xgboost.readthedocs.org)
+[![CRAN Status Badge](http://www.r-pkg.org/badges/version/xgboost)](http://cran.r-project.org/web/packages/xgboost)
+[![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
 
 An optimized general purpose gradient boosting library. The library is parallelized, and also provides an optimized distributed version.
-It implements machine learning algorithm under gradient boosting framework, including generalized linear model and gradient boosted regression tree (GBDT). XGBoost can also be distributed and scale to Terascale data
 
-Contributors: https://github.com/dmlc/xgboost/graphs/contributors
+It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework, including [Generalized Linear Model](https://en.wikipedia.org/wiki/Generalized_linear_model) (GLM) and [Gradient Boosted Decision Trees](https://en.wikipedia.org/wiki/Gradient_boosting#Gradient_tree_boosting) (GBDT). XGBoost can also be [distributed](#features) and scale to Terascale data.
 
-Documentations: [Documentation of xgboost](doc/README.md)
+XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) <img src=https://avatars2.githubusercontent.com/u/11508361?v=3&s=20> projects
 
-Issues Tracker: [https://github.com/dmlc/xgboost/issues](https://github.com/dmlc/xgboost/issues?q=is%3Aissue+label%3Aquestion)
+Contents
+--------
+* [What's New](#whats-new)
+* [Version](#version)
+* [Documentation](doc/index.md)
+* [Build Instruction](doc/build.md)
+* [Features](#features)
+* [Distributed XGBoost](multi-node)
+* [Usecases](doc/README.md#highlight-links)
+* [Bug Reporting](#bug-reporting)
+* [Contributing to XGBoost](#contributing-to-xgboost)
+* [Committers and Contributors](CONTRIBUTORS.md)
+* [License](#license)
+* [XGBoost in Graphlab Create](#xgboost-in-graphlab-create)
 
-Please join [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/) to ask questions and share your experience on xgboost.
-  - Use issue tracker for bug reports, feature requests etc.
-  - Use the user group to post your experience, ask questions about general usages.
+What's New
+----------
 
-Gitter for developers [![Gitter chat for developers at https://gitter.im/dmlc/xgboost](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/dmlc/xgboost?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+* XGBoost helps Chenglong Chen to win [Kaggle CrowdFlower Competition](https://www.kaggle.com/c/crowdflower-search-relevance)
+  Check out the [winning solution](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
+* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
+* XGBoost helps three champion teams to win [WWW2015  Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
+  Check out the [winning solution](doc/README.md#highlight-links)
+* [External Memory Version](doc/external_memory.md)
 
-Distributed Version: [Distributed XGBoost](multi-node)
+Version
+-------
 
-Highlights of Usecases: [Highlight Links](doc/README.md#highlight-links)
+* Current version xgboost-0.4
+  - [Change log](CHANGES.md)
+  - This version is compatible with 0.3x versions
 
-XGBoost is part of [Distributed Machine Learning Common](http://dmlc.github.io/) projects
+Features
+--------
+* Easily accessible through CLI, [python](https://github.com/dmlc/xgboost/blob/master/demo/guide-python/basic_walkthrough.py),
+  [R](https://github.com/dmlc/xgboost/blob/master/R-package/demo/basic_walkthrough.R),
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
+* It's fast! Benchmark numbers comparing xgboost, H2O, Spark, R - [benchm-ml numbers](https://github.com/szilard/benchm-ml)
+* Memory efficient - Handles sparse matrices, supports external memory
+* Accurate prediction, and used extensively by data scientists and kagglers - [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
+* Distributed version runs on Hadoop (YARN), MPI, SGE etc., scales to billions of examples.
 
-What's New
-==========
-* XGBoost-0.4 release, see [CHANGES.md](CHANGES.md#xgboost-04)
-* XGBoost wins [WWW2015  Microsoft Malware Classification Challenge (BIG 2015)](http://www.kaggle.com/c/malware-classification/forums/t/13490/say-no-to-overfitting-approaches-sharing)
-  - Checkout the winning solution at [Highlight links](doc/README.md#highlight-links)
-* [External Memory Version](doc/external_memory.md)
+Bug Reporting
+-------------
 
-Features
-========
-* Easily accessible in python, R, Julia, CLI
-* Fast speed and memory efficient
-  - Can be more than 10 times faster than GBM in sklearn and R
-  - Handles sparse matrices, support external memory
-* Accurate prediction, and used extensively by data scientists and kagglers
-  - See [highlight links](https://github.com/dmlc/xgboost/blob/master/doc/README.md#highlight-links)
-* Distributed and Portable
-  - The distributed version runs on Hadoop (YARN), MPI, SGE etc.
-  - Scales to billions of examples and beyond
-
-Build
-=======
-* Run ```bash build.sh``` (you can also type make)
-  - Normally it gives what you want
-  - See [Build Instruction](doc/build.md) for more information
+* For reporting bugs please use the [xgboost/issues](https://github.com/dmlc/xgboost/issues) page.
+* For generic questions or to share your experience using xgboost please use the [XGBoost User Group](https://groups.google.com/forum/#!forum/xgboost-user/)
 
-Version
-=======
-* Current version xgboost-0.4, a lot improvment has been made since 0.3
-  - Change log in [CHANGES.md](CHANGES.md)
-  - This version is compatible with 0.3x versions
+
+Contributing to XGBoost
+-----------------------
+
+XGBoost has been developed and used by a group of active community members. Everyone is more than welcome to contribute. It is a way to make the project better and more accessible to more users.
+* Check out [Feature Wish List](https://github.com/dmlc/xgboost/labels/Wish-List) to see what can be improved, or open an issue if you want something.
+* Contribute to the [documents and examples](https://github.com/dmlc/xgboost/blob/master/doc/) to share your experience with other users.
+* Please add your name to [CONTRIBUTORS.md](CONTRIBUTORS.md) after your patch has been merged.
+
+License
+-------
+© Contributors, 2015. Licensed under an [Apache-2](https://github.com/dmlc/xgboost/blob/master/LICENSE) license.
 
 XGBoost in Graphlab Create
-==========================
-* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the Graphlab Create in http://graphlab.com/products/create/quick-start-guide.html
-* Nice blogpost by Jay Gu using GLC boosted tree to solve kaggle bike sharing challenge: http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand
+--------------------------
+* XGBoost is adopted as part of boosted tree toolkit in Graphlab Create (GLC). Graphlab Create is a powerful python toolkit that allows you to do data manipulation, graph processing, hyper-parameter search, and visualization of TeraBytes scale data in one framework. Try the [Graphlab Create](http://graphlab.com/products/create/quick-start-guide.html)
+* Nice [blogpost](http://blog.graphlab.com/using-gradient-boosted-trees-to-predict-bike-sharing-demand) by Jay Gu about using GLC boosted tree to solve kaggle bike sharing challenge.
diff --git a/appveyor.yml b/appveyor.yml
new file mode 100644
index 000000000000..c1367d52e877
--- /dev/null
+++ b/appveyor.yml
@@ -0,0 +1,36 @@
+environment:
+  global:
+   CMD_IN_ENV: "cmd /E:ON /V:ON /C .\\python-appveyor-demo\\appveyor\\run_with_env.cmd"
+   DISABLE_OPENMP: 1
+   VisualStudioVersion: 12.0
+   
+  matrix:
+    - PYTHON: "C:\\Python27-x64"
+      PYTHON_VERSION: "2.7.x" # currently 2.7.9
+      PYTHON_ARCH: "64"
+
+    - PYTHON: "C:\\Python33-x64"
+      PYTHON_VERSION: "3.3.x" # currently 3.3.5
+      PYTHON_ARCH: "64"
+
+platform:
+  - x64
+
+configuration:
+  - Release
+
+install:
+  - cmd: git clone https://github.com/ogrisel/python-appveyor-demo
+  - ECHO "Filesystem root:"
+  - ps: "ls \"C:/\""
+
+  - ECHO "Installed SDKs:"
+  - ps: "ls \"C:/Program Files/Microsoft SDKs/Windows\""
+
+  - ps: python-appveyor-demo\appveyor\install.ps1
+  - "SET PATH=%PYTHON%;%PYTHON%\\Scripts;%PATH%"
+  - "python --version"
+  - "python -c \"import struct; print(struct.calcsize('P') * 8)\""
+
+build: off
+  #project: windows\xgboost.sln
\ No newline at end of file
diff --git a/demo/.gitignore b/demo/.gitignore
index e52797d1512f..ee79c704b268 100644
--- a/demo/.gitignore
+++ b/demo/.gitignore
@@ -1 +1,2 @@
-*.libsvm
\ No newline at end of file
+*.libsvm
+*.pkl
diff --git a/demo/README.md b/demo/README.md
index 49e9e52b88e7..d6f061484962 100644
--- a/demo/README.md
+++ b/demo/README.md
@@ -1,14 +1,14 @@
-XGBoost Examples
-====
-This folder contains all the code examples using xgboost. 
+XGBoost Code Examples
+=====================
+This folder contains all the code examples using xgboost.
 
 * Contribution of examples, benchmarks is more than welcome!
 * If you like to share how you use xgboost to solve your problem, send a pull request:)
- 
+
 Features Walkthrough
-====
-This is a list of short codes introducing different functionalities of xgboost and its wrapper.
-* Basic walkthrough of wrappers 
+--------------------
+This is a list of short codes introducing different functionalities of xgboost packages.
+* Basic walkthrough of packages
   [python](guide-python/basic_walkthrough.py)
   [R](../R-package/demo/basic_walkthrough.R)
   [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/basic_walkthrough.jl)
@@ -20,24 +20,24 @@ This is a list of short codes introducing different functionalities of xgboost a
   [python](guide-python/boost_from_prediction.py)
   [R](../R-package/demo/boost_from_prediction.R)
   [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
-* Predicting using first n trees 
+* Predicting using first n trees
   [python](guide-python/predict_first_ntree.py)
   [R](../R-package/demo/boost_from_prediction.R)
-  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)  
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/boost_from_prediction.jl)
 * Generalized Linear Model
   [python](guide-python/generalized_linear_model.py)
   [R](../R-package/demo/generalized_linear_model.R)
-  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl)  
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/generalized_linear_model.jl)
 * Cross validation
   [python](guide-python/cross_validation.py)
   [R](../R-package/demo/cross_validation.R)
-  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)  
+  [Julia](https://github.com/antinucleon/XGBoost.jl/blob/master/demo/cross_validation.jl)
 * Predicting leaf indices
   [python](guide-python/predict_leaf_indices.py)
   [R](../R-package/demo/predict_leaf_indices.R)
 
 Basic Examples by Tasks
-====
+-----------------------
 Most of examples in this section are based on CLI or python version.
 However, the parameter settings can be applied to all versions
 * [Binary classification](binary_classification)
@@ -46,7 +46,7 @@ However, the parameter settings can be applied to all versions
 * [Learning to Rank](rank)
 
 Benchmarks
-====
+----------
 * [Starter script for Kaggle Higgs Boson](kaggle-higgs)
-* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution) 
+* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
 
diff --git a/demo/binary_classification/README.md b/demo/binary_classification/README.md
index 8d1e5e2a5899..482666ec4f87 100644
--- a/demo/binary_classification/README.md
+++ b/demo/binary_classification/README.md
@@ -162,7 +162,11 @@ If you want to continue boosting from existing model, say 0002.model, use
 ```
 xgboost will load from 0002.model continue boosting for 2 rounds, and save output to continue.model. However, beware that the training and evaluation data specified in mushroom.conf should not change when you use this function.
 #### Use Multi-Threading
-When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded, to set number of parallel running threads to 10, add ```nthread=10``` to your configuration.
+When you are working with a large dataset, you may want to take advantage of parallelism. If your compiler supports OpenMP, xgboost is naturally multi-threaded; to set the number of parallel running threads, add the ```nthread``` parameter to your configuration.
+E.g. ```nthread=10```
+
+Set nthread to the number of physical CPU cores (on Unix, this can be found using ```lscpu```).
+Some systems report ```Thread(s) per core = 2```, for example a 4-core CPU with 8 hardware threads; in that case set ```nthread=4```, not 8.
 
 #### Additional Notes
 * What are ```agaricus.txt.test.buffer``` and ```agaricus.txt.train.buffer``` generated during runexp.sh? 
diff --git a/demo/guide-python/README.md b/demo/guide-python/README.md
index 32d0290ab7e6..d26b8fcf2b06 100644
--- a/demo/guide-python/README.md
+++ b/demo/guide-python/README.md
@@ -1,11 +1,12 @@
 XGBoost Python Feature Walkthrough
-====
-* [Basic walkthrough of wrappers](basic_walkthrough.py) 
+==================================
+* [Basic walkthrough of wrappers](basic_walkthrough.py)
 * [Cutomize loss function, and evaluation metric](custom_objective.py)
 * [Boosting from existing prediction](boost_from_prediction.py)
 * [Predicting using first n trees](predict_first_ntree.py)
 * [Generalized Linear Model](generalized_linear_model.py)
 * [Cross validation](cross_validation.py)
 * [Predicting leaf indices](predict_leaf_indices.py)
-* [Sklearn Wrapper](sklearn_example.py)
+* [Sklearn Wrapper](sklearn_examples.py)
+* [Sklearn Parallel](sklearn_parallel.py)
 * [External Memory](external_memory.py)
diff --git a/demo/guide-python/runall.sh b/demo/guide-python/runall.sh
index 8f4f9832a6d4..5c8ddf93ce94 100755
--- a/demo/guide-python/runall.sh
+++ b/demo/guide-python/runall.sh
@@ -2,7 +2,11 @@
 python basic_walkthrough.py
 python custom_objective.py
 python boost_from_prediction.py
+python predict_first_ntree.py
 python generalized_linear_model.py
 python cross_validation.py
 python predict_leaf_indices.py
+python sklearn_examples.py
+python sklearn_parallel.py
+python external_memory.py
 rm -rf *~ *.model *.buffer 
diff --git a/demo/guide-python/sklearn_examples.py b/demo/guide-python/sklearn_examples.py
index ce8c8d01e9b6..7ce95b491dd9 100755
--- a/demo/guide-python/sklearn_examples.py
+++ b/demo/guide-python/sklearn_examples.py
@@ -8,7 +8,7 @@
 import xgboost as xgb
 
 import numpy as np
-from sklearn.cross_validation import KFold
+from sklearn.cross_validation import KFold, train_test_split
 from sklearn.metrics import confusion_matrix, mean_squared_error
 from sklearn.grid_search import GridSearchCV
 from sklearn.datasets import load_iris, load_digits, load_boston
@@ -65,3 +65,13 @@
 pickle.dump(clf, open("best_boston.pkl", "wb"))
 clf2 = pickle.load(open("best_boston.pkl", "rb"))
 print(np.allclose(clf.predict(X), clf2.predict(X)))
+
+# Early-stopping
+
+X = digits['data']
+y = digits['target']
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+clf = xgb.XGBClassifier()
+clf.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="auc",
+        eval_set=[(X_test, y_test)])
+
diff --git a/demo/kaggle-otto/understandingXGBoostModel.Rmd b/demo/kaggle-otto/understandingXGBoostModel.Rmd
index 6bd64401d206..e04277d4ee20 100644
--- a/demo/kaggle-otto/understandingXGBoostModel.Rmd
+++ b/demo/kaggle-otto/understandingXGBoostModel.Rmd
@@ -1,7 +1,7 @@
 ---
 title: "Understanding XGBoost Model on Otto Dataset"
 author: "Michaël Benesty"
-output: 
+output:
   rmarkdown::html_vignette:
     css: ../../R-package/vignettes/vignette.css
     number_sections: yes
@@ -54,7 +54,7 @@ test[1:6,1:5, with =F]
 
 Each *column* represents a feature measured by an `integer`. Each *row* is an **Otto** product.
 
-Obviously the first column (`ID`) doesn't contain any useful information. 
+Obviously the first column (`ID`) doesn't contain any useful information.
 
 To let the algorithm focus on real stuff, we will delete it.
 
@@ -124,7 +124,7 @@ param <- list("objective" = "multi:softprob",
 cv.nround <- 5
 cv.nfold <- 3
 
-bst.cv = xgb.cv(param=param, data = trainMatrix, label = y, 
+bst.cv = xgb.cv(param=param, data = trainMatrix, label = y,
                 nfold = cv.nfold, nrounds = cv.nround)
 ```
 > As we can see the error rate is low on the test dataset (for a 5mn trained model).
@@ -144,7 +144,7 @@ Feature importance
 
 So far, we have built a model made of **`r nround`** trees.
 
-To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products). 
+To build a tree, the dataset is divided recursively several times. At the end of the process, you get groups of observations (here, these observations are properties regarding **Otto** products).
 
 Each division operation is called a *split*.
 
@@ -158,7 +158,7 @@ In the same way, in Boosting we try to optimize the missclassification at each r
 
 The improvement brought by each *split* can be measured, it is the *gain*.
 
-Each *split* is done on one feature only at one value. 
+Each *split* is done on one feature only at one value.
 
 Let's see what the model looks like.
 
@@ -168,7 +168,7 @@ model[1:10]
 ```
 > For convenience, we are displaying the first 10 lines of the model only.
 
-Clearly, it is not easy to understand what it means. 
+Clearly, it is not easy to understand what it means.
 
 Basically each line represents a *branch*, there is the *tree* ID, the feature ID, the point where it *splits*, and information regarding the next *branches* (left, right, when the row for this feature is N/A).
 
@@ -217,7 +217,7 @@ xgb.plot.tree(feature_names = names, model = bst, n_first_tree = 2)
 
 We are just displaying the first two trees here.
 
-On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the intersaction between features is complicated. 
+On simple models the first two trees may be enough. Here, it might not be the case. We can see from the size of the trees that the interaction between features is complicated.
 Besides, **XGBoost** generate `k` trees at each round for a `k`-classification problem. Therefore the two trees illustrated here are trying to classify data into different classes.
 
 Going deeper
diff --git a/doc/.gitignore b/doc/.gitignore
new file mode 100644
index 000000000000..382c3419ff43
--- /dev/null
+++ b/doc/.gitignore
@@ -0,0 +1,7 @@
+html
+latex
+*.sh
+_*
+doxygen
+parser.py
+*.pyc
diff --git a/doc/Makefile b/doc/Makefile
new file mode 100644
index 000000000000..40bba2a280db
--- /dev/null
+++ b/doc/Makefile
@@ -0,0 +1,192 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS  = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
+
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html       to make standalone HTML files"
+	@echo "  dirhtml    to make HTML files named index.html in directories"
+	@echo "  singlehtml to make a single large HTML file"
+	@echo "  pickle     to make pickle files"
+	@echo "  json       to make JSON files"
+	@echo "  htmlhelp   to make HTML files and a HTML help project"
+	@echo "  qthelp     to make HTML files and a qthelp project"
+	@echo "  applehelp  to make an Apple Help Book"
+	@echo "  devhelp    to make HTML files and a Devhelp project"
+	@echo "  epub       to make an epub"
+	@echo "  latex      to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  latexpdf   to make LaTeX files and run them through pdflatex"
+	@echo "  latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+	@echo "  text       to make text files"
+	@echo "  man        to make manual pages"
+	@echo "  texinfo    to make Texinfo files"
+	@echo "  info       to make Texinfo files and run them through makeinfo"
+	@echo "  gettext    to make PO message catalogs"
+	@echo "  changes    to make an overview of all changed/added/deprecated items"
+	@echo "  xml        to make Docutils-native XML files"
+	@echo "  pseudoxml  to make pseudoxml-XML files for display purposes"
+	@echo "  linkcheck  to check all external links for integrity"
+	@echo "  doctest    to run all doctests embedded in the documentation (if enabled)"
+	@echo "  coverage   to run coverage check of the documentation (if enabled)"
+
+clean:
+	rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+	$(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+	@echo
+	@echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/rabit.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/rabit.qhc"
+
+applehelp:
+	$(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+	@echo
+	@echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+	@echo "N.B. You won't be able to view it unless you put it in" \
+	      "~/Library/Documentation/Help or install it in your application" \
+	      "bundle."
+
+devhelp:
+	$(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+	@echo
+	@echo "Build finished."
+	@echo "To view the help file:"
+	@echo "# mkdir -p $$HOME/.local/share/devhelp/rabit"
+	@echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/rabit"
+	@echo "# devhelp"
+
+epub:
+	$(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+	@echo
+	@echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make' in that directory to run these through (pdf)latex" \
+	      "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through pdflatex..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo "Running LaTeX files through platex and dvipdfmx..."
+	$(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+	@echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+	$(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+	@echo
+	@echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+	$(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+	@echo
+	@echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo
+	@echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+	@echo "Run \`make' in that directory to run these through makeinfo" \
+	      "(use \`make info' here to do that automatically)."
+
+info:
+	$(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+	@echo "Running Texinfo files through makeinfo..."
+	make -C $(BUILDDIR)/texinfo info
+	@echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+	$(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+	@echo
+	@echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
+
+coverage:
+	$(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+	@echo "Testing of coverage in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/coverage/python.txt."
+
+xml:
+	$(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+	@echo
+	@echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+	$(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+	@echo
+	@echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/doc/README b/doc/README
new file mode 100644
index 000000000000..ca391b77c020
--- /dev/null
+++ b/doc/README
@@ -0,0 +1,7 @@
+The documentation of xgboost is generated with recommonmark and sphinx.
+
+You can build it locally by typing "make html" in this folder.
+- clone https://github.com/tqchen/recommonmark to root
+- type make html
+
+Check out https://recommonmark.readthedocs.org for a guide on how to write markdown with the extensions used in this doc, such as math formulas and tables of contents.
\ No newline at end of file
diff --git a/doc/README.md b/doc/README.md
deleted file mode 100644
index 371e18f21d0d..000000000000
--- a/doc/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-List of Documentations
-====
-* [Using XGBoost in Python](python.md)
-* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
-* [Learning to use xgboost by example](../demo)
-* [External Memory Version](external_memory.md)
-* [Text input format](input_format.md)
-* [Build Instruction](build.md)
-* [Notes on the Code](../src)
-* List of all parameters and their usage: [Parameters](parameter.md)
-  - [Notes on Parameter Tunning](param_tuning.md)
-* Learning about the model: [Introduction to Boosted Trees](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf)
-
-How to get started
-====
-* Try to read the [binary classification example](../demo/binary_classification) for getting started example
-* Find the guide specific language guide above for the language you like to use
-* [Learning to use xgboost by example](../demo) contains lots of useful examples
-
-Highlight Links
-====
-This section is about blogposts, presentation and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs to here, send a pull request.
-* [Kaggle Malware Prediction winning solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware)
-* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
-* [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit)
-* Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y)
-* [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/) 
-
-Contribution
-====
-Contribution of documents and use-cases are welcomed!
-* This package use Google C++ style
-* Check tool of codestyle
-  - clone https://github.com/dmlc/dmlc-core into root directory
-  - type ```make lint``` and fix possible errors.
diff --git a/doc/build.md b/doc/build.md
index f9a626603d61..b97237bcbac3 100644
--- a/doc/build.md
+++ b/doc/build.md
@@ -1,5 +1,5 @@
 Build XGBoost
-====
+=============
 * Run ```bash build.sh``` (you can also type make)
 * If you have C++11 compiler, it is recommended to type ```make cxx11=1```
   - C++11 is not used by default
@@ -12,19 +12,19 @@ Build XGBoost
 * OS X with multi-threading support: see [next section](#openmp-for-os-x)
 
 Build XGBoost in OS X with OpenMP
-====
+---------------------------------
 Here is the complete solution to use OpenMp-enabled compilers to install XGBoost.
 
 1. Obtain gcc with openmp support by `brew install gcc --without-multilib` **or** clang with openmp by `brew install clang-omp`. The clang one is recommended because the first method requires us compiling gcc inside the machine (more than an hour in mine)! (BTW, `brew` is the de facto standard of `apt-get` on OS X. So installing [HPC](http://hpc.sourceforge.net/) separately is not recommended, but it should work.)
 
-2. **if you are planing to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to 
+2. **if you are planning to use clang-omp** - in step 3 and/or 4, change line 9 in `xgboost/src/utils/omp.h` to
 
   ```C++
-  #include <libiomp/omp.h> /* instead of #include <omp.h> */` 
+  #include <libiomp/omp.h> /* instead of #include <omp.h> */
   ```
 
-  to make it work, otherwise you might get this error 
-  
+  to make it work, otherwise you might get this error
+
   `src/tree/../utils/omp.h:9:10: error: 'omp.h' file not found...`
 
 
@@ -43,11 +43,11 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost
   export CXX = clang-omp++
   ```
 
-  Remember to change `header` if using clang-omp. 
-  
+  Remember to change `header` (mentioned in step 2) if using clang-omp.
+
   Then `cd xgboost` then `bash build.sh` to compile XGBoost. And go to `wrapper` sub-folder to install python version.
 
-4. Set the `Makevars` file in highest piority for R. 
+4. Set the `Makevars` file in highest priority for R.
 
   The point is, there are three `Makevars` : `~/.R/Makevars`, `xgboost/R-package/src/Makevars`, and `/usr/local/Cellar/r/3.2.0/R.framework/Resources/etc/Makeconf` (the last one obtained by running `file.path(R.home("etc"), "Makeconf")` in R), and `SHLIB_OPENMP_CXXFLAGS` is not set by default!! After trying, it seems that the first one has highest piority (surprise!).
 
@@ -75,21 +75,21 @@ Here is the complete solution to use OpenMp-enabled compilers to install XGBoost
 
   Again, remember to change `header` if using clang-omp.
 
-  Then inside R, run 
+  Then inside R, run
 
   ```R
   install.packages('xgboost/R-package/', repos=NULL, type='source')
   ```
-  
+
   Or
-  
+
   ```R
   devtools::install_local('xgboost/', subdir = 'R-package') # you may use devtools
   ```
 
 
 Build with HDFS and S3 Support
-=====
+------------------------------
 * To build xgboost use with HDFS/S3 support and distributed learnig. It is recommended to build with dmlc, with the following steps
   - ```git clone https://github.com/dmlc/dmlc-core```
   - Follow instruction in dmlc-core/make/config.mk to compile libdmlc.a
diff --git a/doc/conf.py b/doc/conf.py
new file mode 100644
index 000000000000..05e1e91babf0
--- /dev/null
+++ b/doc/conf.py
@@ -0,0 +1,167 @@
+# -*- coding: utf-8 -*-
+#
+# documentation build configuration file, created by
+# sphinx-quickstart on Thu Jul 23 19:40:08 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+import sys
+import os, subprocess
+import shlex
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
+libpath = os.path.join(curr_path, '../python-package/')
+sys.path.insert(0, libpath)
+sys.path.insert(0, curr_path)
+
+from sphinx_util import MarkdownParser, AutoStructify
+
+# -- mock out modules
+import mock
+MOCK_MODULES = ['numpy', 'scipy', 'scipy.sparse', 'sklearn', 'matplotlib']
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = mock.Mock()
+
+# -- General configuration ------------------------------------------------
+
+# General information about the project.
+project = u'xgboost'
+author = u'%s developers' % project
+copyright = u'2015, %s' % author
+github_doc_root = 'https://github.com/dmlc/xgboost/tree/master/doc/'
+
+# add markdown parser
+MarkdownParser.github_doc_root = github_doc_root
+source_parsers = {
+    '.md': MarkdownParser,
+}
+os.environ['XGBOOST_BUILD_DOC'] = '1'
+# Version information.
+import xgboost
+version = xgboost.__version__
+release = xgboost.__version__
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.napoleon',
+    'sphinx.ext.mathjax',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = ['.rst', '.md']
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+# html_theme = 'alabaster'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = project + 'doc'
+
+# -- Options for LaTeX output ---------------------------------------------
+latex_elements = {
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+  (master_doc, '%s.tex' % project, project,
+   author, 'manual'),
+]
+
+# hook for doxygen
+def run_doxygen(folder):
+    """Run the doxygen make command in the designated folder."""
+    try:
+        retcode = subprocess.call("cd %s; make doxygen" % folder, shell=True)
+        if retcode < 0:
+            sys.stderr.write("doxygen terminated by signal %s" % (-retcode))
+    except OSError as e:
+        sys.stderr.write("doxygen execution failed: %s" % e)
+
+def generate_doxygen_xml(app):
+    """Run the doxygen make commands if we're on the ReadTheDocs server"""
+    read_the_docs_build = os.environ.get('READTHEDOCS', None) == 'True'
+    if read_the_docs_build:
+        run_doxygen('..')
+
+def setup(app):
+    # Add hook for building doxygen xml when needed
+    # no c++ API for now
+    # app.connect("builder-inited", generate_doxygen_xml)
+    app.add_config_value('recommonmark_config', {
+            'url_resolver': lambda url: github_doc_root + url,
+            }, True)
+    app.add_transform(AutoStructify)
diff --git a/doc/dev-guide/contribute.md b/doc/dev-guide/contribute.md
new file mode 100644
index 000000000000..03060ab599bc
--- /dev/null
+++ b/doc/dev-guide/contribute.md
@@ -0,0 +1,13 @@
+Developer Guide
+===============
+This page contains a guide for developers of xgboost. XGBoost has been developed and used by an active community.
+Everyone is more than welcome to contribute; it is a great way to make the project better.
+The project is maintained by a committee of [committers](../../CONTRIBUTORS.md#comitters) who will review and merge pull requests from contributors.
+
+Contributing Code
+-----------------
+* The C++ code follows Google C++ style
+* We follow numpy style to document our python module
+* Tools to precheck codestyle
+  - clone https://github.com/dmlc/dmlc-core into root directory
+  - type ```make lint``` and fix possible errors.
diff --git a/doc/external_memory.md b/doc/external_memory.md
index f8eec83fc8d1..e50c02e570fb 100644
--- a/doc/external_memory.md
+++ b/doc/external_memory.md
@@ -1,5 +1,5 @@
 Using XGBoost External Memory Version(beta)
-====
+===========================================
 There is no big difference between using external memory version and in-memory version.
 The only difference is the filename format.
 
@@ -19,13 +19,13 @@ You can find that there is additional ```#dtrain.cache``` following the libsvm f
 For CLI version, simply use ```"../data/agaricus.txt.train#dtrain.cache"``` in filename.
 
 Performance Note
-====
+----------------
 * the parameter ```nthread``` should be set to number of ***real*** cores
   - Most modern CPU offer hyperthreading, which means you can have a 4 core cpu with 8 threads
   - Set nthread to be 4 for maximum performance in such case
 
 Distributed Version
-====
+-------------------
 The external memory mode naturally works on distributed version, you can simply set path like
 ```
 data = "hdfs:///path-to-data/#dtrain.cache"
@@ -34,8 +34,8 @@ xgboost will cache the data to the local position. When you run on YARN, the cur
 so that you can directly use ```dtrain.cache``` to cache to current folder.
 
 
-Usage Note:
-====
+Usage Note
+----------
 * This is a experimental version
   - If you like to try and test it, report results to https://github.com/dmlc/xgboost/issues/244
 * Currently only importing from libsvm format is supported
diff --git a/doc/faq.md b/doc/faq.md
new file mode 100644
index 000000000000..32dc5a1b33f8
--- /dev/null
+++ b/doc/faq.md
@@ -0,0 +1,65 @@
+Frequently Asked Questions
+==========================
+This document contains frequently asked questions about xgboost.
+
+How to tune parameters
+----------------------
+See [Parameter Tuning Guide](param_tuning.md)
+
+Description on the model
+------------------------
+See [Introduction to Boosted Trees](model.md)
+
+
+I have a big dataset
+--------------------
+XGBoost is designed to be memory efficient. Usually it could handle problems as long as the data fit into your memory
+(This usually means millions of instances).
+If you are running out of memory, check out the [external memory version](external_memory.md) or
+[distributed version](https://github.com/dmlc/wormhole/tree/master/learn/xgboost) of xgboost.
+
+
+Running xgboost on Platform X (Hadoop/Yarn, Mesos)
+--------------------------------------------------
+The distributed version of XGBoost is designed to be portable to various environments.
+Distributed XGBoost can be ported to any platform that supports [rabit](https://github.com/dmlc/rabit).
+You can directly run xgboost on Yarn. In theory Mesos and other resource allocation engines can be easily supported as well.
+
+
+Why not implement distributed xgboost on top of X (Spark, Hadoop)
+-----------------------------------------------------------------
+The first fact we need to know is going distributed does not necessarily solve all the problems.
+Instead, it creates more problems such as more communication overhead and fault tolerance.
+The ultimate question will still come back into how to push the limit of each computation node
+and use less resources to complete the task (thus with less communication and chance of failure).
+
+To achieve these, we decide to reuse the optimizations in the single node xgboost and build distributed version on top of it.
+The demand of communication in machine learning is rather simple, in a sense that we can depend on a limited set of API (in our case rabit).
+Such a design allows us to reuse most of the code, and to be portable to major platforms such as Hadoop/Yarn, MPI, SGE.
+Most importantly, it pushes the limit of the computation resources we can use.
+
+
+How can I port the model to my own system
+-----------------------------------------
+The model and data format of XGBoost is exchangeable,
+which means the model trained by one language can be loaded in another.
+This means you can train the model using R, while running prediction using
+Java or C++, which are more common in production system.
+You can also train the model using distributed version,
+and load it in python to do some interactive analysis.
+
+
+Do you support LambdaMART
+-------------------------
+Yes, xgboost implements LambdaMART. Check out the objective section in [parameters](parameter.md)
+
+
+How to deal with Missing Value
+------------------------------
+xgboost supports missing values by default.
+
+
+Slightly different result between runs
+--------------------------------------
+This could happen, due to non-determinism in floating point summation order and multi-threading.
+Though the general accuracy will usually remain the same.
diff --git a/doc/img/cart.png b/doc/img/cart.png
new file mode 100644
index 000000000000..aaaa9ddbec4e
Binary files /dev/null and b/doc/img/cart.png differ
diff --git a/doc/img/split_find.png b/doc/img/split_find.png
new file mode 100644
index 000000000000..f6116073f6d1
Binary files /dev/null and b/doc/img/split_find.png differ
diff --git a/doc/img/step_fit.png b/doc/img/step_fit.png
new file mode 100644
index 000000000000..b94b5e196296
Binary files /dev/null and b/doc/img/step_fit.png differ
diff --git a/doc/img/struct_score.png b/doc/img/struct_score.png
new file mode 100644
index 000000000000..c347c95de019
Binary files /dev/null and b/doc/img/struct_score.png differ
diff --git a/doc/img/twocart.png b/doc/img/twocart.png
new file mode 100644
index 000000000000..f1c0dae7c0f4
Binary files /dev/null and b/doc/img/twocart.png differ
diff --git a/doc/index.md b/doc/index.md
new file mode 100644
index 000000000000..40b7c519252f
--- /dev/null
+++ b/doc/index.md
@@ -0,0 +1,71 @@
+XGBoost Documentation
+=====================
+This is the documentation of the xgboost library.
+XGBoost is short for eXtreme gradient boosting. This is a library that is designed and optimized for boosted (tree) algorithms.
+The goal of this library is to push the extreme of the computation limits of machines to provide a ***scalable***, ***portable*** and ***accurate*** library
+for large scale tree boosting.
+
+
+This document is hosted at http://xgboost.readthedocs.org/. You can also browse most of the documents in github directly.
+
+How to Get Started
+------------------
+The best way to get started to learn xgboost is by the examples. There are three types of examples you can find in xgboost.
+* [Tutorials](#tutorials) are self-contained tutorials on complete data science tasks.
+* [XGBoost Code Examples](../demo/) are collections of code and benchmarks of xgboost.
+  - There is a walkthrough section in this to walk you through specific API features.
+* [Highlight Solutions](#highlight-solutions) are presentations using xgboost to solve real world problems.
+  - These examples are usually more advanced. You can usually find state-of-art solutions to many problems and challenges in here.
+
+After you get familiar with the interface, check out the following additional resources
+* [Frequently Asked Questions](faq.md)
+* [Learning what is behind: Introduction to Boosted Trees](model.md)
+* [User Guide](#user-guide) contains comprehensive list of documents of xgboost.
+* [Developer Guide](dev-guide/contribute.md)
+
+Tutorials
+---------
+Tutorials are self-contained materials that teach you how to achieve a complete data science task with xgboost; these
+are great resources to learn xgboost by real examples. If you think you have something that belongs to here, send a pull request.
+* [Binary classification using XGBoost Command Line](../demo/binary_classification/) (CLI)
+  - This tutorial introduces the basic usage of CLI version of xgboost
+* [Introduction of XGBoost in Python](python/python_intro.md) (python)
+  - This tutorial introduces the python package of xgboost
+* [Introduction to XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd) (R package)
+  - This is a general presentation about xgboost in R.
+* [Discover your data with XGBoost in R](../R-package/vignettes/discoverYourData.Rmd) (R package)
+  - This tutorial explains feature analysis in xgboost.
+* [Understanding XGBoost Model on Otto Dataset](../demo/kaggle-otto/understandingXGBoostModel.Rmd) (R package)
+  - This tutorial teaches you how to use xgboost to compete in the Kaggle Otto challenge.
+
+
+Highlight Solutions
+-------------------
+This section is about blogposts, presentation and videos discussing how to use xgboost to solve your interesting problem. If you think something belongs to here, send a pull request.
+* [Kaggle CrowdFlower winner's solution by Chenglong Chen](https://github.com/ChenglongChen/Kaggle_CrowdFlower)
+* [Kaggle Malware Prediction winner's solution](https://github.com/xiaozhouwang/kaggle_Microsoft_Malware)
+* [Kaggle Tradeshift winning solution by daxiongshu](https://github.com/daxiongshu/kaggle-tradeshift-winning-solution)
+* [Feature Importance Analysis with XGBoost in Tax audit](http://fr.slideshare.net/MichaelBENESTY/feature-importance-analysis-with-xgboost-in-tax-audit)
+* Video tutorial: [Better Optimization with Repeated Cross Validation and the XGBoost model](https://www.youtube.com/watch?v=Og7CGAfSr_Y)
+* [Winning solution of Kaggle Higgs competition: what a single model can do](http://no2147483647.wordpress.com/2014/09/17/winning-solution-of-kaggle-higgs-competition-what-a-single-model-can-do/)
+
+User Guide
+----------
+* [Frequently Asked Questions](faq.md)
+* [Introduction to Boosted Trees](model.md)
+* [Using XGBoost in Python](python/python_intro.md)
+* [Using XGBoost in R](../R-package/vignettes/xgboostPresentation.Rmd)
+* [Learning to use XGBoost by Example](../demo)
+* [External Memory Version](external_memory.md)
+* [Text input format](input_format.md)
+* [Build Instruction](build.md)
+* [Parameters](parameter.md)
+* [Notes on Parameter Tuning](param_tuning.md)
+
+Developer Guide
+---------------
+* [Developer Guide](dev-guide/contribute.md)
+
+API Reference
+-------------
+* [Python API Reference](python/python_api.rst)
diff --git a/doc/input_format.md b/doc/input_format.md
index 557b875121f0..3986d07fb182 100644
--- a/doc/input_format.md
+++ b/doc/input_format.md
@@ -1,12 +1,13 @@
-Input Format
-====
+Text Input Format of DMatrix
+============================
+
 ## Basic Input Format
 As we have mentioned, XGBoost takes LibSVM format. For training or predicting, XGBoost takes an instance file with the format as below:
 
 train.txt
 ```
 1 101:1.2 102:0.03
-0 1:2.1 10001:300 10002:400 
+0 1:2.1 10001:300 10002:400
 0 0:1.3 1:0.3
 1 0:0.01 1:0.3
 0 0:0.2 1:0.3
@@ -37,7 +38,7 @@ train.txt.weight
 0.5
 ```
 It means that XGBoost will emphasize more on the first and fourth instance， that is to say positive instances while training.
-The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost. 
+The configuration is similar to configuring the group information. If the instance file name is "xxx", XGBoost will check whether there is a file named "xxx.weight" in the same directory and if there is, will use the weights while training models. Weights will be included into an "xxx.buffer" file that is created by XGBoost automatically. If you want to update the weights, you need to delete the "xxx.buffer" file prior to launching XGBoost.
 
 ## Initial Margin file
 XGBoost supports providing each instance an initial margin prediction. For example, if we have a initial prediction using logistic regression for "train.txt" file, we can create the following file:
diff --git a/doc/model.md b/doc/model.md
new file mode 100644
index 000000000000..f4373b3fc537
--- /dev/null
+++ b/doc/model.md
@@ -0,0 +1,232 @@
+Introduction to Boosted Trees
+=============================
+XGBoost is short for "Extreme Gradient Boosting", where the term "Gradient Boosting" was proposed in the paper _Greedy Function Approximation: A Gradient Boosting Machine_, by Friedman. XGBoost is based on this original model. This is a tutorial on boosted trees; most of the content is based on these [slides](http://homes.cs.washington.edu/~tqchen/pdf/BoostedTree.pdf) by the author of xgboost.
+
+The GBM (boosted trees) has been around for quite a while, and there are a lot of materials on the topic. This tutorial tries to explain boosted trees in a self-contained and principled way, using the elements of supervised learning. We think this explanation is cleaner, more formal, and motivates the variant used in xgboost.
+
+Elements of Supervised Learning
+-------------------------------
+XGBoost is used for supervised learning problems, where we use the training data ``$ x_i $`` to predict a target variable ``$ y_i $``.
+Before we dive into trees, let us start by reviewing the basic elements of supervised learning.
+
+### Model and Parameters
+The ***model*** in supervised learning usually refers to the mathematical structure of how to make the prediction ``$ y_i $`` given ``$ x_i $``.
+For example, a common model is a *linear model*, where the prediction is given by ``$ \hat{y}_i = \sum_j w_j x_{ij} $``, a linear combination of weighted input features.
+The prediction value can have different interpretations, depending on the task.
+For example, it can be logistic transformed to get the probability of the positive class in logistic regression, and it can also be used as a ranking score when we want to rank the outputs.
+
+The ***parameters*** are the undetermined part that we need to learn from data. In linear regression problems, the parameters are the coefficients ``$ w $``.
+Usually we will use ``$ \Theta $`` to denote the parameters.
+
+### Objective Function : Training Loss + Regularization
+
+Based on different understandings or assumptions of ``$ y_i $``, we can have different problems such as regression, classification, ordering, etc.
+We need to find a way to find the best parameters given the training data. In order to do so, we need to define a so-called ***objective function***,
+to measure the performance of the model under a certain set of parameters.
+
+A very important fact about objective functions is that they ***must always*** contain two parts: training loss and regularization.
+
+```math
+Obj(\Theta) = L(\Theta) + \Omega(\Theta)
+```
+
+where ``$ L $`` is the training loss function, and ``$ \Omega $`` is the regularization term. The training loss measures how *predictive* our model is on training data.
+For example, a commonly used training loss is mean squared error.
+
+```math
+L(\Theta) = \sum_i (y_i-\hat{y}_i)^2
+```
+Another commonly used loss function is logistic loss for logistic regression
+
+```math
+L(\Theta) = \sum_i[ y_i\ln (1+e^{-\hat{y}_i}) + (1-y_i)\ln (1+e^{\hat{y}_i})]
+```
+
+The ***regularization term*** is what people usually forget to add. The regularization term controls the complexity of the model, which helps us to avoid overfitting.
+This sounds a bit abstract, so let us consider the problem in the following picture. You are asked to *fit* visually a step function given the input data points
+on the upper left corner of the image. Which solution among the three do you think is the best fit?
+
+![Step function](img/step_fit.png)
+
+The answer is already marked in red. Please think about whether it looks reasonable to you visually. The general principle is that we want a ***simple*** and ***predictive*** model.
+The tradeoff between the two is also referred to as the bias-variance tradeoff in machine learning.
+
+
+### Why introduce the general principle
+The elements introduced above form the basic elements of supervised learning, and they are naturally the building blocks of machine learning toolkits.
+For example, you should be able to describe the differences and commonalities between boosted trees and random forests.
+Understanding the process in a formalized way also helps us to understand the objective that we are learning and the reason behind heuristics such as
+pruning and smoothing.
+
+Tree Ensemble
+-------------
+Now that we have introduced the elements of supervised learning, let us get started with real trees.
+To begin with, let us first learn about the ***model*** of xgboost: tree ensembles.
+The tree ensemble model is a set of classification and regression trees (CART). Here's a simple example of a CART
+that classifies whether someone will like computer games.
+
+![CART](img/cart.png)
+
+We classify the members of the family into different leaves, and assign them the score on the corresponding leaf.
+A CART is a bit different from decision trees, where the leaves only contain decision values. In CART, a real score
+is associated with each of the leaves, which gives us richer interpretations that go beyond classification.
+This also makes the unified optimization step easier, as we will see in a later part of this tutorial.
+
+Usually, a single tree is not strong enough to be used in practice. What is actually used is the so-called
+tree ensemble model, which sums the predictions of multiple trees together.
+
+![TwoCART](img/twocart.png)
+
+Here is an example of a tree ensemble of two trees. The prediction scores of each individual tree are summed up to get the final score.
+If you look at the example, an important fact is that the two trees try to *complement* each other.
+Mathematically, we can write our model in the form
+
+```math
+\hat{y}_i = \sum_{k=1}^K f_k(x_i), f_k \in \mathcal{F}
+```
+
+where ``$ K $`` is the number of trees, ``$ f $`` is a function in the functional space ``$ \mathcal{F} $``, and ``$ \mathcal{F} $`` is the set of all possible CARTs. Therefore our objective to optimize can be written as
+
+```math
+obj(\Theta) = \sum_i^n l(y_i, \hat{y}_i) + \sum_{k=1}^K \Omega(f_k)
+```
+Now here comes the question: what is the *model* of random forests? It is exactly tree ensembles! So random forests and boosted trees are not different in terms of model,
+the difference is how we train them. This means that if you write a predictive service for tree ensembles, you only need to write one and it should directly work
+for both random forests and boosted trees. This is one example of why the elements of supervised learning rock.
+
+Tree Boosting
+-------------
+After introducing the model, let us begin with the real training part. How should we learn the trees?
+The answer is, as is always for all supervised learning models: *define an objective function, and optimize it*!
+
+Assume we have the following objective function (remember it always needs to contain training loss and regularization)
+```math
+Obj = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i)
+```
+
+### Additive Training
+
+The first thing we want to ask is what the ***parameters*** of trees are. You can find that what we need to learn are those functions ``$f_i$``, each containing the structure
+of the tree and the leaf scores. This is much harder than a traditional optimization problem where you can take the gradient and go.
+It is not easy to train all the trees at once.
+Instead, we use an additive strategy: fix what we have learned, and add one new tree at a time.
+We denote the prediction value at step ``$t$`` by ``$ \hat{y}_i^{(t)}$``, so we have
+
+```math
+\hat{y}_i^{(0)} &= 0\\
+\hat{y}_i^{(1)} &= f_1(x_i) = \hat{y}_i^{(0)} + f_1(x_i)\\
+\hat{y}_i^{(2)} &= f_1(x_i) + f_2(x_i)= \hat{y}_i^{(1)} + f_2(x_i)\\
+&\dots\\
+\hat{y}_i^{(t)} &= \sum_{k=1}^t f_k(x_i)= \hat{y}_i^{(t-1)} + f_t(x_i)
+```
+
+It remains to ask: which tree do we want at each step? A natural thing is to add the one that optimizes our objective.
+
+```math
+Obj^{(t)} & = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t)}) + \sum_{i=1}^t\Omega(f_i) \\
+          & = \sum_{i=1}^n l(y_i, \hat{y}_i^{(t-1)} + f_t(x_i)) + \Omega(f_t) + constant
+```
+
+If we consider using MSE as our loss function, it becomes the following form.
+
+```math
+Obj^{(t)} & = \sum_{i=1}^n (y_i - (\hat{y}_i^{(t-1)} + f_t(x_i)))^2 + \sum_{i=1}^t\Omega(f_i) \\
+          & = \sum_{i=1}^n [2(\hat{y}_i^{(t-1)} - y_i)f_t(x_i) + f_t(x_i)^2] + \Omega(f_t) + constant
+```
+
+The form of MSE is friendly, with a first order term (usually called the residual) and a quadratic term.
+For other losses of interest (for example, logistic loss), it is not so easy to get such a nice form.
+So in the general case, we take the Taylor expansion of the loss function up to the second order
+
+```math
+Obj^{(t)} = \sum_{i=1}^n [l(y_i, \hat{y}_i^{(t-1)}) + g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t) + constant
+```
+where the ``$g_i$`` and ``$h_i$`` are defined as
+
+```math
+g_i &= \partial_{\hat{y}_i^{(t-1)}} l(y_i, \hat{y}_i^{(t-1)})\\
+h_i &= \partial_{\hat{y}_i^{(t-1)}}^2 l(y_i, \hat{y}_i^{(t-1)})
+```
+
+After we remove all the constants, the specific objective at step ``$t$`` becomes
+
+```math
+\sum_{i=1}^n [g_i f_t(x_i) + \frac{1}{2} h_i f_t^2(x_i)] + \Omega(f_t)
+```
+
+This becomes our optimization goal for the new tree. One important advantage of this definition is that
+it only depends on ``$g_i$`` and ``$h_i$``. This is how xgboost can support custom loss functions.
+We can optimize every loss function, including logistic regression and weighted logistic regression, using exactly
+the same solver that takes ``$g_i$`` and ``$h_i$`` as input!
+
+### Model Complexity
+We have introduced the training step, but wait, there is one important thing, the ***regularization***!
+We need to define the complexity of the tree ``$\Omega(f)$``. In order to do so, let us first refine the definition of a tree ``$ f(x) $`` as
+
+```math
+f_t(x) = w_{q(x)}, w \in R^T, q:R^d\rightarrow \{1,2,\cdots,T\} .
+```
+
+Here ``$ w $`` is the vector of scores on leaves, ``$ q $`` is a function assigning each data point to the corresponding leaf, and ``$ T $`` is the number of leaves.
+In XGBoost, we define the complexity as
+
+```math
+\Omega(f) = \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2
+```
+Of course there is more than one way to define the complexity, but this specific one works well in practice. The regularization is one part that most tree packages treat
+less carefully, or simply ignore. This is because the traditional treatment of tree learning only emphasizes improving impurity, while the complexity control
+is left to heuristics. By defining it formally, we can get a better idea of what we are learning, and yes it works well in practice.
+
+### The Structure Score
+
+Here is the magical part of the derivation. After reformulating the tree model, we can write the objective value with the ``$ t$``-th tree as:
+
+```math
+Obj^{(t)} &\approx \sum_{i=1}^n [g_i w_{q(x_i)} + \frac{1}{2} h_i w_{q(x_i)}^2] + \gamma T + \frac{1}{2}\lambda \sum_{j=1}^T w_j^2\\
+&= \sum^T_{j=1} [(\sum_{i\in I_j} g_i) w_j + \frac{1}{2} (\sum_{i\in I_j} h_i + \lambda) w_j^2 ] + \gamma T
+```
+
+where ``$ I_j = \{i|q(x_i)=j\} $`` is the set of indices of data points assigned to the ``$ j $``-th leaf. Notice that in the second line we have changed the index of the summation because all the data points on the same leaf get the same score. We could further compress the expression by defining ``$ G_j = \sum_{i\in I_j} g_i $`` and ``$ H_j = \sum_{i\in I_j} h_i $``:
+
+```math
+Obj^{(t)} = \sum^T_{j=1} [G_jw_j + \frac{1}{2} (H_j+\lambda) w_j^2] +\gamma T
+```
+
+In this equation the ``$ w_j $`` are independent of each other, the form ``$ G_jw_j+\frac{1}{2}(H_j+\lambda)w_j^2 $`` is quadratic, and the best ``$ w_j $`` for a given structure ``$q(x)$`` and the best objective reduction we can get are:
+
+```math
+w_j^\ast = -\frac{G_j}{H_j+\lambda}\\
+Obj^\ast = -\frac{1}{2} \sum_{j=1}^T \frac{G_j^2}{H_j+\lambda} + \gamma T
+```
+The last equation measures ***how good*** a tree structure ``$q(x)$`` is.
+
+![Structure Score](img/struct_score.png)
+
+If all this sounds a bit complicated, let us take a look at the picture, and see how the scores can be calculated.
+Basically, for a given tree structure, we push the statistics ``$g_i$`` and ``$h_i$`` to the leaves they belong to,
+sum the statistics together, and use the formula to calculate how good the tree is.
+This score is like the impurity measure in a decision tree, except that it also takes the model complexity into account.
+
+### Learn the tree structure
+Now that we have a way to measure how good a tree is, ideally we would enumerate all possible trees and pick the best one.
+In practice this is intractable, so we will try to optimize one level of the tree at a time.
+Specifically we try to split a leaf into two leaves, and the score it gains is
+
+```math
+Gain = \frac{1}{2} \left[\frac{G_L^2}{H_L+\lambda}+\frac{G_R^2}{H_R+\lambda}-\frac{(G_L+G_R)^2}{H_L+H_R+\lambda}\right] - \gamma
+```
+This formula can be decomposed as 1) the score on the new left leaf 2) the score on the new right leaf 3) the score on the original leaf 4) regularization on the additional leaf.
+We can find an important fact here: if the gain is smaller than ``$\gamma$``, we would do better not to add that branch. This is exactly the ***pruning*** technique in tree based
+models! By using the principles of supervised learning, we can naturally come up with the reason these techniques work :)
+
+For real valued data, we usually want to search for an optimal split. To do so efficiently, we place all the instances in sorted order, like the following picture.
+![Best split](img/split_find.png)
+Then a left to right scan is sufficient to calculate the structure score of all possible split solutions, and we can find the best split efficiently.
+
+Final words on XGBoost
+----------------------
+Now that you understand what boosted trees are, you may ask, where is the introduction to [XGBoost](https://github.com/dmlc/xgboost)?
+XGBoost is exactly a tool motivated by the formal principle introduced in this tutorial!
+More importantly, it is developed with deep consideration of both ***systems optimization*** and ***principles in machine learning***.
+The goal of this library is to push the computation limits of machines to the extreme to provide a ***scalable***, ***portable*** and ***accurate*** library.
+Make sure you [try it out](https://github.com/dmlc/xgboost), and most importantly, contribute your piece of wisdom (code, examples, tutorials) to the community!
diff --git a/doc/param_tuning.md b/doc/param_tuning.md
index 78263a6a859a..c5848f6024d6 100644
--- a/doc/param_tuning.md
+++ b/doc/param_tuning.md
@@ -1,5 +1,5 @@
 Notes on Parameter Tuning
-====
+=========================
 Parameter tuning is a dark art in machine learning, the optimal parameters
 of a model can depend on many scenarios. So it is impossible to create a
 comprehensive guide for doing so.
@@ -8,7 +8,7 @@ This document tries to provide some guideline for parameters in xgboost.
 
 
 Understanding Bias-Variance Tradeoff
-====
+------------------------------------
 If you take a machine learning or statistics course, this is likely to be one
 of the most important concepts.
 When we allow the model to get more complicated (e.g. more depth), the model
@@ -22,7 +22,7 @@ will make the model more conservative or not. This can be used to help you
 turn the knob between complicated model and simple model.
 
 Control Overfitting
-====
+-------------------
 When you observe high training accuracy, but low tests accuracy, it is likely that you encounter overfitting problem.
 
 There are in general two ways that you can control overfitting in xgboost
@@ -31,9 +31,9 @@ There are in general two ways that you can control overfitting in xgboost
 * The second way is to add randomness to make training robust to noise
   - This include ```subsample```, ```colsample_bytree```
   - You can also reduce stepsize ```eta```, but needs to remember to increase ```num_round``` when you do so.
-  
-Handle Imbalanced Dataset 
-===
+
+Handle Imbalanced Dataset
+-------------------------
 For common cases such as ads clickthrough log, the dataset is extremely imbalanced.
 This can affect the training of xgboost model, and there are two ways to improve it.
 * If you care only about the ranking order (AUC) of your prediction
diff --git a/doc/parameter.md b/doc/parameter.md
index 13eefa0fec6a..4e0f365bf3db 100644
--- a/doc/parameter.md
+++ b/doc/parameter.md
@@ -1,15 +1,17 @@
 XGBoost Parameters
-====
+==================
 Before running XGboost, we must set three types of parameters, general parameters, booster parameters and task parameters:
 - General parameters relates to which booster we are using to do boosting, commonly tree or linear model
 - Booster parameters depends on which booster you have chosen
-- Task parameters that decides on the learning scenario, for example, regression tasks may use different parameters with ranking tasks.
-- In addition to these parameters, there can be console parameters that relates to behavior of console version of xgboost(e.g. when to save model)
+- Learning task parameters decide on the learning scenario, for example, regression tasks may use different parameters than ranking tasks.
+- Command line parameters relate to the behavior of the CLI version of xgboost.
 
-### Parameters in R Package
+Parameters in R Package
+-----------------------
 In R-package, you can use .(dot) to replace under score in the parameters, for example, you can use max.depth as max_depth. The underscore parameters are also valid in R.
 
-### General Parameters
+General Parameters
+------------------
 * booster [default=gbtree]
   - which booster to use, can be gbtree or gblinear. gbtree uses tree based model while gblinear uses linear function.
 * silent [default=0]
@@ -21,10 +23,8 @@ In R-package, you can use .(dot) to replace under score in the parameters, for e
 * num_feature [set automatically by xgboost, no need to be set by user]
   - feature dimension used in boosting, set to maximum dimension of the feature
 
-### Booster Parameters
-From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parameters. Parameter with or without bst: prefix will be equivalent(i.e. both bst:eta and eta will be valid parameter setting) .
-
-#### Parameter for Tree Booster
+Parameters for Tree Booster
+---------------------------
 * eta [default=0.3]
   - step size shrinkage used in update to prevents overfitting. After each boosting step, we can directly get the weights of new features. and eta actually shrinks the feature weights to make the boosting process more conservative.
   - range: [0,1]
@@ -47,7 +47,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
   - subsample ratio of columns when constructing each tree.
   - range: (0,1]
 
-#### Parameter for Linear Booster
+Parameters for Linear Booster
+-----------------------------
 * lambda [default=0]
   - L2 regularization term on weights
 * alpha [default=0]
@@ -55,7 +56,8 @@ From xgboost-unity, the ```bst:``` prefix is no longer needed for booster parame
 * lambda_bias
   - L2 regularization term on bias, default 0(no L1 reg on bias because it is not important)
 
-### Task Parameters
+Learning Task Parameters
+------------------------
 * objective [ default=reg:linear ]
  - specify the learning task and the corresponding learning objective, and the objective options are below:
  - "reg:linear" --linear regression
@@ -87,7 +89,8 @@ training repeatively
 * seed [ default=0 ]
  - random number seed.
 
-### Console Parameters
+Command Line Parameters
+-----------------------
 The following parameters are only used in the console version of xgboost
 * use_buffer [ default=1 ]
  -  whether create binary buffer for text input, this normally will speedup loading when do
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
new file mode 100644
index 000000000000..1374e4bfc99a
--- /dev/null
+++ b/doc/python/python_api.rst
@@ -0,0 +1,47 @@
+Python API Reference
+====================
+This page gives the Python API reference of xgboost. Please also refer to the Python Package Introduction for more information about the Python package.
+
+The documentation on this page is automatically generated by Sphinx. The content does not render on GitHub; you can view it at http://xgboost.readthedocs.org/en/latest/python/python_api.html
+
+Core Data Structure
+-------------------
+.. automodule:: xgboost.core
+
+.. autoclass:: xgboost.DMatrix
+    :members:
+    :show-inheritance:
+
+.. autoclass:: xgboost.Booster
+    :members:
+    :show-inheritance:
+
+
+Learning API
+------------
+.. automodule:: xgboost.training
+
+.. autofunction:: xgboost.train
+
+.. autofunction:: xgboost.cv
+
+
+Scikit-Learn API
+----------------
+.. automodule:: xgboost.sklearn
+.. autoclass:: xgboost.XGBRegressor
+    :members:
+    :show-inheritance:
+.. autoclass:: xgboost.XGBClassifier
+    :members:
+    :show-inheritance:
+
+Plotting API
+------------
+.. automodule:: xgboost.plotting
+
+.. autofunction:: xgboost.plot_importance
+
+.. autofunction:: xgboost.plot_tree
+
+.. autofunction:: xgboost.to_graphviz
diff --git a/doc/python.md b/doc/python/python_intro.md
similarity index 61%
rename from doc/python.md
rename to doc/python/python_intro.md
index dfe886fe9f99..b46358877dd4 100644
--- a/doc/python.md
+++ b/doc/python/python_intro.md
@@ -1,31 +1,27 @@
-XGBoost Python Module
-====
+Python Package Introduction
+===========================
+This document gives a basic walkthrough of xgboost python package.
 
-This page will introduce XGBoost Python module, including:
-* [Building and Import](#building-and-import)
-* [Data Interface](#data-interface)
-* [Setting Parameters](#setting-parameters)
-* [Train Model](#training-model)
-* [Early Stopping](#early-stopping)
-* [Prediction](#prediction)
+***List of other Helpful Links***
+* [Python walkthrough code collections](https://github.com/tqchen/xgboost/blob/master/demo/guide-python)
+* [Python API Reference](python_api.rst)
 
-A [walk through python example](https://github.com/tqchen/xgboost/blob/master/demo/guide-python) for UCI Mushroom dataset is provided.
-
-=
-#### Install
-
-To install XGBoost, you need to run `make` in the root directory of the project and then in the `wrappers` directory run
+Install XGBoost
+---------------
+To install XGBoost, do the following steps.
 
+* You need to run `make` in the root directory of the project
+* In the  `python-package` directory run
 ```shell
 python setup.py install
 ```
-Then import the module in Python as usual
+
 ```python
 import xgboost as xgb
 ```
 
-=
-#### Data Interface
+Data Interface
+--------------
 XGBoost python module is able to loading from libsvm txt format file, Numpy 2D array and xgboost binary buffer file. The data will be store in ```DMatrix``` object.
 
 * To load libsvm text format file and XGBoost binary file into ```DMatrix```, the usage is like
@@ -41,8 +37,8 @@ dtrain = xgb.DMatrix( data, label=label)
 ```
 * Build ```DMatrix``` from ```scipy.sparse```
 ```python
-csr = scipy.sparse.csr_matrix( (dat, (row,col)) )
-dtrain = xgb.DMatrix( csr )
+csr = scipy.sparse.csr_matrix((dat, (row, col)))
+dtrain = xgb.DMatrix(csr)
 ```
 * Saving ```DMatrix``` into XGBoost binary file will make loading faster in next time. The usage is like:
 ```python
@@ -51,18 +47,17 @@ dtrain.save_binary("train.buffer")
 ```
 * To handle missing value in ```DMatrix```, you can initialize the ```DMatrix``` like:
 ```python
-dtrain = xgb.DMatrix( data, label=label, missing = -999.0)
+dtrain = xgb.DMatrix(data, label=label, missing = -999.0)
 ```
 * Weight can be set when needed, like
 ```python
-w = np.random.rand(5,1)
-dtrain = xgb.DMatrix( data, label=label, missing = -999.0, weight=w)
+w = np.random.rand(5, 1)
+dtrain = xgb.DMatrix(data, label=label, missing = -999.0, weight=w)
 ```
 
-
-=
-#### Setting Parameters
-XGBoost use list of pair to save [parameters](parameter.md). Eg
+Setting Parameters
+------------------
+XGBoost uses a list of pairs to save [parameters](../parameter.md). E.g.
 * Booster parameters
 ```python
 param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'binary:logistic' }
@@ -76,8 +71,9 @@ plst += [('eval_metric', 'ams@0')]
 evallist  = [(dtest,'eval'), (dtrain,'train')]
 ```
 
-=
-#### Training Model
+Training
+--------
+
 With parameter list and data, you are able to train a model.
 * Training
 ```python
@@ -103,10 +99,11 @@ After you save your model, you can load model file at anytime by using
 bst = xgb.Booster({'nthread':4}) #init model
 bst.load_model("model.bin") # load data
 ```
-=
-#### Early stopping
 
-If you have a validation set, you can use early stopping to find the optimal number of boosting rounds. Early stopping requires at least one set in `evals`. If there's more than one, it will use the last.
+Early Stopping
+--------------
+If you have a validation set, you can use early stopping to find the optimal number of boosting rounds.
+Early stopping requires at least one set in `evals`. If there's more than one, it will use the last.
 
 `train(..., evals=evals, early_stopping_rounds=10)`
 
@@ -116,16 +113,41 @@ If early stopping occurs, the model will have two additional fields: `bst.best_s
 
 This works with both metrics to minimize (RMSE, log loss, etc.) and to maximize (MAP, NDCG, AUC).
 
-=
-#### Prediction
+Prediction
+----------
 After you training/loading a model and preparing the data, you can start to do prediction.
 ```python
-data = np.random.rand(7,10) # 7 entities, each contains 10 features
-dtest = xgb.DMatrix( data, missing = -999.0 )
-ypred = bst.predict( xgmat )
+# 7 entities, each contains 10 features
+data = np.random.rand(7, 10)
+dtest = xgb.DMatrix(data)
+ypred = bst.predict(dtest)
 ```
 
 If early stopping is enabled during training, you can predict with the best iteration.
 ```python
 ypred = bst.predict(xgmat,ntree_limit=bst.best_iteration)
 ```
+
+Plotting
+--------
+
+You can use the plotting module to plot feature importance and output trees.
+
+To plot importance, use ``plot_importance``. This function requires ``matplotlib`` to be installed.
+
+```python
+xgb.plot_importance(bst)
+```
+
+To output a tree via ``matplotlib``, use ``plot_tree``, specifying the ordinal number of the target tree.
+This function requires ``graphviz`` and ``matplotlib``.
+
+```python
+xgb.plot_tree(bst, num_trees=2)
+```
+
+When you use ``IPython``, you can use ``to_graphviz`` function which converts the target tree to ``graphviz`` instance. ``graphviz`` instance is automatically rendered on ``IPython``.
+
+```python
+xgb.to_graphviz(bst, num_trees=2)
+```
\ No newline at end of file
diff --git a/doc/sphinx_util.py b/doc/sphinx_util.py
new file mode 100644
index 000000000000..a09f1e08baa5
--- /dev/null
+++ b/doc/sphinx_util.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+"""Helper utility function for customization."""
+import sys
+import os
+import docutils
+import subprocess
+
+if os.environ.get('READTHEDOCS', None) == 'True':
+    subprocess.call('cd ..; rm -rf recommonmark;' +
+                    'git clone https://github.com/tqchen/recommonmark', shell=True)
+
+sys.path.insert(0, os.path.abspath('../recommonmark/'))
+
+from recommonmark import parser, transform
+
+MarkdownParser = parser.CommonMarkParser
+AutoStructify = transform.AutoStructify
diff --git a/python-package/.gitignore b/python-package/.gitignore
new file mode 100644
index 000000000000..d765c67c773e
--- /dev/null
+++ b/python-package/.gitignore
@@ -0,0 +1,3 @@
+build
+dist
+*.egg*
\ No newline at end of file
diff --git a/python-package/MANIFEST.in b/python-package/MANIFEST.in
new file mode 100644
index 000000000000..2d93429a9ff6
--- /dev/null
+++ b/python-package/MANIFEST.in
@@ -0,0 +1,7 @@
+include *.sh *.md
+recursive-include xgboost *
+recursive-include xgboost/wrapper *
+recursive-include xgboost/windows *
+recursive-include xgboost/subtree *
+recursive-include xgboost/src *
+recursive-include xgboost/multi-node *
diff --git a/python-package/README.md b/python-package/README.md
new file mode 100644
index 000000000000..eb0fa8cca53f
--- /dev/null
+++ b/python-package/README.md
@@ -0,0 +1,27 @@
+XGBoost Python Package
+======================
+Installation
+------------
+We are on [PyPI](https://pypi.python.org/pypi/xgboost) now. For stable version, please install using pip:
+
+* ```pip install xgboost```
+* Note for Windows users: this pip installation may not work in some Windows environments, and it may cause unexpected errors. pip installation on Windows is currently disabled pending further investigation; please install from github.
+
+For up-to-date version, please install from github.
+
+* To make the python module, type ```./build.sh``` in the root directory of project
+* Make sure you have [setuptools](https://pypi.python.org/pypi/setuptools)
+* Install with `python setup.py install` from this directory.
+* For windows users, please use the Visual Studio project file under [windows folder](../windows/). See also the [installation tutorial](https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13043/run-xgboost-from-windows-and-python) from Kaggle Otto Forum.
+
+Examples
+------
+
+* Refer also to the walk through example in [demo folder](../demo/guide-python)
+* See also the [example scripts](../demo/kaggle-higgs) for Kaggle Higgs Challenge, including [speedtest script](../demo/kaggle-higgs/speedtest.py) on this dataset.
+
+Note
+-----
+
+* If you want to build xgboost on Mac OS X with multi-threading (OpenMP) support, which clang in Xcode doesn't support by default, please install gcc 4.9 or higher using [homebrew](http://brew.sh/) ```brew tap homebrew/versions; brew install gcc49```
+* If you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the [sklearn_parallel.py](../demo/guide-python/sklearn_parallel.py) demo.
diff --git a/python-package/setup.cfg b/python-package/setup.cfg
new file mode 100644
index 000000000000..b88034e414bc
--- /dev/null
+++ b/python-package/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+description-file = README.md
diff --git a/python-package/setup.py b/python-package/setup.py
new file mode 100644
index 000000000000..3f8ad9a1d875
--- /dev/null
+++ b/python-package/setup.py
@@ -0,0 +1,49 @@
+# pylint: disable=invalid-name
+"""Setup xgboost package."""
+from __future__ import absolute_import
+import sys
+from setuptools import setup, find_packages
+import subprocess
+sys.path.insert(0, '.')
+
+import os
+#build on the fly if install in pip
+#otherwise, use build.sh in the parent directory
+
+if 'pip' in __file__:
+    if not os.name == 'nt': #if not windows
+        build_sh = subprocess.Popen(['sh', 'xgboost/build-python.sh'])
+        build_sh.wait()
+        output = build_sh.communicate()
+        print(output)
+
+import xgboost
+
+LIB_PATH = xgboost.core.find_lib_path()
+#print LIB_PATH
+
+#to deploy to pip, please use
+#make pythonpack
+#python setup.py register sdist upload
+#and be sure to test it firstly using "python setup.py register sdist upload -r pypitest"
+setup(name='xgboost',
+      version=xgboost.__version__,
+      #version='0.4a13',
+      description=xgboost.__doc__,
+      install_requires=[
+          'numpy',
+          'scipy',
+      ],
+      maintainer='Hongliang Liu',
+      maintainer_email='phunter.lau@gmail.com',
+      zip_safe=False,
+      packages=find_packages(),
+      #don't need this and don't use this, give everything to MANIFEST.in
+      #package_dir = {'':'xgboost'},
+      #package_data = {'': ['*.txt','*.md','*.sh'],
+      #               }
+      #this will use MANIFEST.in during install where we specify additional files,
+      #this is the golden line
+      include_package_data=True,
+      data_files=[('xgboost', LIB_PATH)],
+      url='https://github.com/dmlc/xgboost')
diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
new file mode 100644
index 000000000000..b251b450119b
--- /dev/null
+++ b/python-package/xgboost/__init__.py
@@ -0,0 +1,18 @@
+# coding: utf-8
+"""XGBoost: eXtreme Gradient Boosting library.
+
+Contributors: https://github.com/dmlc/xgboost/blob/master/CONTRIBUTORS.md
+"""
+
+from __future__ import absolute_import
+from .core import DMatrix, Booster
+from .training import train, cv
+from .sklearn import XGBModel, XGBClassifier, XGBRegressor
+from .plotting import plot_importance, plot_tree, to_graphviz
+
+__version__ = '0.4'
+
+__all__ = ['DMatrix', 'Booster',
+           'train', 'cv',
+           'XGBModel', 'XGBClassifier', 'XGBRegressor',
+           'plot_importance', 'plot_tree', 'to_graphviz']
diff --git a/python-package/xgboost/build-python.sh b/python-package/xgboost/build-python.sh
new file mode 100755
index 000000000000..398b076b819d
--- /dev/null
+++ b/python-package/xgboost/build-python.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+# This is a simple script to build xgboost on Mac and Linux for the python wrapper only.
+# It first tries to build with OpenMP; if that fails, it disables OpenMP and builds again.
+# This will automatically build xgboost for Mac users who don't have OpenMP support.
+# In most cases, typing make will give you what you want.
+
+# See additional instruction in doc/build.md
+
+# note: this script is built for the python package only, and it might have some filename
+#       conflicts with build.sh, which is for everything.
+
+
+cd xgboost || exit 1  # plain cd (not pushd) also works when run through sh
+if make python; then
+    echo "Successfully build multi-thread xgboost"
+else
+    echo "-----------------------------"
+    echo "Building multi-thread xgboost failed"
+    echo "Start to build single-thread xgboost"
+    make clean
+    make python no_omp=1 || { echo "Building single-thread xgboost failed"; exit 1; }
+    echo "Successfully build single-thread xgboost"
+    echo "If you want multi-threaded version"
+    echo "See additional instructions in doc/build.md"
+fi
+cd ..
diff --git a/wrapper/xgboost.py b/python-package/xgboost/core.py
similarity index 57%
rename from wrapper/xgboost.py
rename to python-package/xgboost/core.py
index 96f6c25735d2..41943cd61218 100644
--- a/wrapper/xgboost.py
+++ b/python-package/xgboost/core.py
@@ -1,17 +1,10 @@
 # coding: utf-8
-"""
-xgboost: eXtreme Gradient Boosting library
-
-Version: 0.40
-Authors: Tianqi Chen, Bing Xu
-Early stopping by Zygmunt Zając
-"""
-# pylint: disable=too-many-arguments, too-many-locals, too-many-lines, invalid-name
+# pylint: disable=too-many-arguments
+"""Core XGBoost Library."""
 from __future__ import absolute_import
 
 import os
 import sys
-import re
 import ctypes
 import platform
 import collections
@@ -19,13 +12,6 @@
 import numpy as np
 import scipy.sparse
 
-try:
-    from sklearn.base import BaseEstimator
-    from sklearn.base import RegressorMixin, ClassifierMixin
-    from sklearn.preprocessing import LabelEncoder
-    SKLEARN_INSTALLED = True
-except ImportError:
-    SKLEARN_INSTALLED = False
 
 class XGBoostLibraryNotFound(Exception):
     """Error throwed by when xgboost is not found"""
@@ -35,7 +21,6 @@ class XGBoostError(Exception):
     """Error throwed by xgboost trainer."""
     pass
 
-__all__ = ['DMatrix', 'CVPack', 'Booster', 'aggcv', 'cv', 'mknfold', 'train']
 
 if sys.version_info[0] == 3:
     # pylint: disable=invalid-name
@@ -44,30 +29,53 @@ class XGBoostError(Exception):
     # pylint: disable=invalid-name
     STRING_TYPES = basestring,
 
-def load_xglib():
-    """Load the xgboost library."""
+
+def find_lib_path():
+    """Find the path to xgboost dynamic library files.
+
+    Returns
+    -------
+    lib_path: list(string)
+       List of all found library paths to xgboost
+    """
     curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    dll_path = [curr_path]
+    #make pythonpack hack: copy this directory one level upper for setup.py
+    dll_path = [curr_path, os.path.join(curr_path, '../../wrapper/')
+                , os.path.join(curr_path, './wrapper/')]
     if os.name == 'nt':
         if platform.architecture()[0] == '64bit':
-            dll_path.append(os.path.join(curr_path, '../windows/x64/Release/'))
+            dll_path.append(os.path.join(curr_path, '../../windows/x64/Release/'))
+            #hack for pip installation when copy all parent source directory here
+            dll_path.append(os.path.join(curr_path, './windows/x64/Release/'))
         else:
-            dll_path.append(os.path.join(curr_path, '../windows/Release/'))
+            dll_path.append(os.path.join(curr_path, '../../windows/Release/'))
+            #hack for pip installation when copy all parent source directory here
+            dll_path.append(os.path.join(curr_path, './windows/Release/'))
     if os.name == 'nt':
         dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
     else:
         dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
     lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-    if len(dll_path) == 0:
+    if len(lib_path) == 0 and not os.environ.get('XGBOOST_BUILD_DOC', False):
         raise XGBoostLibraryNotFound(
-            'cannot find find the files in the candicate path ' + str(dll_path))
+            'Cannot find XGBoost Libarary in the candicate path, ' +
+            'did you run build.sh in root path?\n'
+            'List of candidates:\n' + ('\n'.join(dll_path)))
+    return lib_path
+
+
+def _load_lib():
+    """Load xgboost Library."""
+    lib_path = find_lib_path()
+    if len(lib_path) == 0:
+        return None
     lib = ctypes.cdll.LoadLibrary(lib_path[0])
     lib.XGBGetLastError.restype = ctypes.c_char_p
-
     return lib
 
+
 # load the XGBoost library globally
-_LIB = load_xglib()
+_LIB = _load_lib()
 
 def _check_call(ret):
     """Check the return value of C API call
@@ -117,7 +125,12 @@ def c_array(ctype, values):
 
 
 class DMatrix(object):
-    """Data Matrix used in XGBoost."""
+    """Data Matrix used in XGBoost.
+
+    DMatrix is an internal data structure used by XGBoost,
+    which is optimized for both memory efficiency and training speed.
+    You can construct DMatrix from numpy.arrays
+    """
     def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
         """
         Data matrix used in XGBoost.
@@ -125,15 +138,16 @@ def __init__(self, data, label=None, missing=0.0, weight=None, silent=False):
         Parameters
         ----------
         data : string/numpy array/scipy.sparse
-            Data source, string type is the path of svmlight format txt file,
-            xgb buffer or path to cache_file
-        label : list or numpy 1-D array (optional)
+            Data source of DMatrix.
+            When data is string type, it represents the path of a libsvm format txt file,
+            or a binary file that xgboost can read from.
+        label : list or numpy 1-D array, optional
             Label of the training data.
-        missing : float
+        missing : float, optional
             Value in the data which needs to be present as a missing value.
-        weight : list or numpy 1-D array (optional)
+        weight : list or numpy 1-D array, optional
             Weight for each instance.
-        silent: boolean
+        silent : boolean, optional
             Whether print messages during construction
         """
         # force into void_p, mac need to pass things in as void_p
@@ -400,11 +414,14 @@ def slice(self, rindex):
 
 
 class Booster(object):
-    """"A Booster of of XGBoost."""
+    """A Booster of XGBoost.
+
+    Booster is the model of xgboost, that contains low level routines for
+    training, prediction and evaluation.
+    """
     def __init__(self, params=None, cache=(), model_file=None):
         # pylint: disable=invalid-name
-        """
-        Learner class.
+        """Initialize the Booster.
 
         Parameters
         ----------
@@ -464,13 +481,22 @@ def copy(self):
         """Copy the booster object.
 
         Returns
-        --------
-        a copied booster model
+        -------
+        booster: `Booster`
+          a copied booster model
         """
         return self.__copy__()
 
     def set_param(self, params, value=None):
-        """Set parameters into the DMatrix."""
+        """Set parameters into the Booster.
+
+        Parameters
+        ----------
+        params: dict/list/str
+           list of key,value pairs, dict of key to value or simply str key
+        value: optional
+           value of the specified parameter, when params is str key
+        """
         if isinstance(params, collections.Mapping):
             params = params.items()
         elif isinstance(params, STRING_TYPES) and value is not None:
@@ -480,7 +506,7 @@ def set_param(self, params, value=None):
 
     def update(self, dtrain, iteration, fobj=None):
         """
-        Update (one iteration).
+        Update for one iteration, with objective function calculated internally.
 
         Parameters
         ----------
@@ -502,7 +528,7 @@ def update(self, dtrain, iteration, fobj=None):
 
     def boost(self, dtrain, grad, hess):
         """
-        Update.
+        Boost the booster for one iteration, with customized gradient statistics.
 
         Parameters
         ----------
@@ -537,7 +563,8 @@ def eval_set(self, evals, iteration=0, feval=None):
 
         Returns
         -------
-        evaluation result
+        result: str
+            Evaluation result string.
         """
         if feval is None:
             for d in evals:
@@ -562,18 +589,21 @@ def eval_set(self, evals, iteration=0, feval=None):
     def eval(self, data, name='eval', iteration=0):
         """Evaluate the model on mat.
 
-
         Parameters
-        ---------
+        ----------
         data : DMatrix
             The dmatrix storing the input.
 
-        name : str (default = 'eval')
-            The name of the dataset
+        name : str, optional
+            The name of the dataset.
 
+        iteration : int, optional
+            The current iteration number.
 
-        iteration : int (default = 0)
-            The current iteration number
+        Returns
+        -------
+        result: str
+            Evaluation result string.
         """
         return self.eval_set([(data, name)], iteration)
 
@@ -735,436 +765,3 @@ def get_fscore(self, fmap=''):
                 else:
                     fmap[fid] += 1
         return fmap
-
-
-def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
-          early_stopping_rounds=None, evals_result=None):
-    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
-    """Train a booster with given parameters.
-
-    Parameters
-    ----------
-    params : dict
-        Booster params.
-    dtrain : DMatrix
-        Data to be trained.
-    num_boost_round: int
-        Number of boosting iterations.
-    watchlist (evals): list of pairs (DMatrix, string)
-        List of items to be evaluated during training, this allows user to watch
-        performance on the validation set.
-    obj : function
-        Customized objective function.
-    feval : function
-        Customized evaluation function.
-    early_stopping_rounds: int
-        Activates early stopping. Validation error needs to decrease at least
-        every <early_stopping_rounds> round(s) to continue training.
-        Requires at least one item in evals.
-        If there's more than one, will use the last.
-        Returns the model from the last iteration (not the best one).
-        If early stopping occurs, the model will have two additional fields:
-        bst.best_score and bst.best_iteration.
-    evals_result: dict
-        This dictionary stores the evaluation results of all the items in watchlist
-
-    Returns
-    -------
-    booster : a trained booster model
-    """
-
-    evals = list(evals)
-    bst = Booster(params, [dtrain] + [d[0] for d in evals])
-
-    if evals_result is not None:
-        if isinstance(evals_result, dict):
-            raise TypeError('evals_result has to be a dictionary')
-        else:
-            evals_name = [d[1] for d in evals]
-            evals_result.clear()
-            evals_result.update({key:[] for key in evals_name})
-
-    if not early_stopping_rounds:
-        for i in range(num_boost_round):
-            bst.update(dtrain, i, obj)
-            if len(evals) != 0:
-                bst_eval_set = bst.eval_set(evals, i, feval)
-                if isinstance(bst_eval_set, STRING_TYPES):
-                    msg = bst_eval_set
-                else:
-                    msg = bst_eval_set.decode()
-
-                sys.stderr.write(msg + '\n')
-                if evals_result is not None:
-                    res = re.findall(":([0-9.]+).", msg)
-                    for key, val in zip(evals_name, res):
-                        evals_result[key].append(val)
-        return bst
-
-    else:
-        # early stopping
-        if len(evals) < 1:
-            raise ValueError('For early stopping you need at least one set in evals.')
-
-        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
-                evals[-1][1], early_stopping_rounds))
-
-        # is params a list of tuples? are we using multiple eval metrics?
-        if isinstance(params, list):
-            if len(params) != len(dict(params).items()):
-                raise ValueError('Check your params.'\
-                                     'Early stopping works with single eval metric only.')
-            params = dict(params)
-
-        # either minimize loss or maximize AUC/MAP/NDCG
-        maximize_score = False
-        if 'eval_metric' in params:
-            maximize_metrics = ('auc', 'map', 'ndcg')
-            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
-                maximize_score = True
-
-        if maximize_score:
-            best_score = 0.0
-        else:
-            best_score = float('inf')
-
-        best_msg = ''
-        best_score_i = 0
-
-        for i in range(num_boost_round):
-            bst.update(dtrain, i, obj)
-            bst_eval_set = bst.eval_set(evals, i, feval)
-
-            if isinstance(bst_eval_set, STRING_TYPES):
-                msg = bst_eval_set
-            else:
-                msg = bst_eval_set.decode()
-
-            sys.stderr.write(msg + '\n')
-
-            if evals_result is not None:
-                res = re.findall(":([0-9.]+).", msg)
-                for key, val in zip(evals_name, res):
-                    evals_result[key].append(val)
-
-            score = float(msg.rsplit(':', 1)[1])
-            if (maximize_score and score > best_score) or \
-                    (not maximize_score and score < best_score):
-                best_score = score
-                best_score_i = i
-                best_msg = msg
-            elif i - best_score_i >= early_stopping_rounds:
-                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
-                bst.best_score = best_score
-                bst.best_iteration = best_score_i
-                break
-        bst.best_score = best_score
-        bst.best_iteration = best_score_i
-        return bst
-
-class CVPack(object):
-    """"Auxiliary datastruct to hold one fold of CV."""
-    def __init__(self, dtrain, dtest, param):
-        """"Initialize the CVPack"""
-        self.dtrain = dtrain
-        self.dtest = dtest
-        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
-        self.bst = Booster(param, [dtrain, dtest])
-
-    def update(self, iteration, fobj):
-        """"Update the boosters for one iteration"""
-        self.bst.update(self.dtrain, iteration, fobj)
-
-    def eval(self, iteration, feval):
-        """"Evaluate the CVPack for one iteration."""
-        return self.bst.eval_set(self.watchlist, iteration, feval)
-
-
-def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
-    """
-    Make an n-fold list of CVPack from random indices.
-    """
-    evals = list(evals)
-    np.random.seed(seed)
-    randidx = np.random.permutation(dall.num_row())
-    kstep = len(randidx) / nfold
-    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
-    ret = []
-    for k in range(nfold):
-        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
-        dtest = dall.slice(idset[k])
-        # run preprocessing on the data set if needed
-        if fpreproc is not None:
-            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
-        else:
-            tparam = param
-        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
-        ret.append(CVPack(dtrain, dtest, plst))
-    return ret
-
-
-def aggcv(rlist, show_stdv=True):
-    # pylint: disable=invalid-name
-    """
-    Aggregate cross-validation results.
-    """
-    cvmap = {}
-    ret = rlist[0].split()[0]
-    for line in rlist:
-        arr = line.split()
-        assert ret == arr[0]
-        for it in arr[1:]:
-            if not isinstance(it, STRING_TYPES):
-                it = it.decode()
-            k, v = it.split(':')
-            if k not in cvmap:
-                cvmap[k] = []
-            cvmap[k].append(float(v))
-    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
-        v = np.array(v)
-        if not isinstance(ret, STRING_TYPES):
-            ret = ret.decode()
-        if show_stdv:
-            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
-        else:
-            ret += '\tcv-%s:%f' % (k, np.mean(v))
-    return ret
-
-
-def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
-       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
-    # pylint: disable = invalid-name
-    """Cross-validation with given paramaters.
-
-    Parameters
-    ----------
-    params : dict
-        Booster params.
-    dtrain : DMatrix
-        Data to be trained.
-    num_boost_round : int
-        Number of boosting iterations.
-    nfold : int
-        Number of folds in CV.
-    metrics : list of strings
-        Evaluation metrics to be watched in CV.
-    obj : function
-        Custom objective function.
-    feval : function
-        Custom evaluation function.
-    fpreproc : function
-        Preprocessing function that takes (dtrain, dtest, param) and returns
-        transformed versions of those.
-    show_stdv : bool
-        Whether to display the standard deviation.
-    seed : int
-        Seed used to generate the folds (passed to numpy.random.seed).
-
-    Returns
-    -------
-    evaluation history : list(string)
-    """
-    results = []
-    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
-    for i in range(num_boost_round):
-        for fold in cvfolds:
-            fold.update(i, obj)
-        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
-        sys.stderr.write(res + '\n')
-        results.append(res)
-    return results
-
-
-# used for compatiblity without sklearn
-XGBModelBase = object
-XGBClassifierBase = object
-XGBRegressorBase = object
-if SKLEARN_INSTALLED:
-    XGBModelBase = BaseEstimator
-    XGBRegressorBase = RegressorMixin
-    XGBClassifierBase = ClassifierMixin
-
-class XGBModel(XGBModelBase):
-    # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
-    """Implementation of the Scikit-Learn API for XGBoost.
-
-    Parameters
-    ----------
-    max_depth : int
-        Maximum tree depth for base learners.
-    learning_rate : float
-        Boosting learning rate (xgb's "eta")
-    n_estimators : int
-        Number of boosted trees to fit.
-    silent : boolean
-        Whether to print messages while running boosting.
-    objective : string
-        Specify the learning task and the corresponding learning objective.
-
-    nthread : int
-        Number of parallel threads used to run xgboost.
-    gamma : float
-        Minimum loss reduction required to make a further partition on a leaf node of the tree.
-    min_child_weight : int
-        Minimum sum of instance weight(hessian) needed in a child.
-    max_delta_step : int
-        Maximum delta step we allow each tree's weight estimation to be.
-    subsample : float
-        Subsample ratio of the training instance.
-    colsample_bytree : float
-        Subsample ratio of columns when constructing each tree.
-
-    base_score:
-        The initial prediction score of all instances, global bias.
-    seed : int
-        Random number seed.
-    missing : float, optional
-        Value in the data which needs to be present as a missing value. If
-        None, defaults to np.nan.
-    """
-    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
-                 silent=True, objective="reg:linear",
-                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
-                 subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0, missing=None):
-        if not SKLEARN_INSTALLED:
-            raise XGBoostError('sklearn needs to be installed in order to use this module')
-        self.max_depth = max_depth
-        self.learning_rate = learning_rate
-        self.n_estimators = n_estimators
-        self.silent = silent
-        self.objective = objective
-
-        self.nthread = nthread
-        self.gamma = gamma
-        self.min_child_weight = min_child_weight
-        self.max_delta_step = max_delta_step
-        self.subsample = subsample
-        self.colsample_bytree = colsample_bytree
-
-        self.base_score = base_score
-        self.seed = seed
-        self.missing = missing if missing is not None else np.nan
-        self._Booster = None
-
-    def __setstate__(self, state):
-        # backward compatiblity code
-        # load booster from raw if it is raw
-        # the booster now support pickle
-        bst = state["_Booster"]
-        if bst is not None and not isinstance(bst, Booster):
-            state["_Booster"] = Booster(model_file=bst)
-        self.__dict__.update(state)
-
-    def booster(self):
-        """Get the underlying xgboost Booster of this model.
-
-        This will raise an exception when fit was not called
-
-        Returns
-        -------
-        booster : a xgboost booster of underlying model
-        """
-        if self._Booster is None:
-            raise XGBoostError('need to call fit beforehand')
-        return self._Booster
-
-    def get_params(self, deep=False):
-        """Get parameter.s"""
-        params = super(XGBModel, self).get_params(deep=deep)
-        if params['missing'] is np.nan:
-            params['missing'] = None  # sklearn doesn't handle nan. see #4725
-        return params
-
-    def get_xgb_params(self):
-        """Get xgboost type parameters."""
-        xgb_params = self.get_params()
-
-        xgb_params['silent'] = 1 if self.silent else 0
-
-        if self.nthread <= 0:
-            xgb_params.pop('nthread', None)
-        return xgb_params
-
-    def fit(self, data, y):
-        # pylint: disable=missing-docstring,invalid-name
-        train_dmatrix = DMatrix(data, label=y, missing=self.missing)
-        self._Booster = train(self.get_xgb_params(), train_dmatrix, self.n_estimators)
-        return self
-
-    def predict(self, data):
-        # pylint: disable=missing-docstring,invalid-name
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        return self.booster().predict(test_dmatrix)
-
-
-class XGBClassifier(XGBModel, XGBClassifierBase):
-    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost classification
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
-
-    def __init__(self, max_depth=3, learning_rate=0.1,
-                 n_estimators=100, silent=True,
-                 objective="binary:logistic",
-                 nthread=-1, gamma=0, min_child_weight=1,
-                 max_delta_step=0, subsample=1, colsample_bytree=1,
-                 base_score=0.5, seed=0, missing=None):
-        super(XGBClassifier, self).__init__(max_depth, learning_rate,
-                                            n_estimators, silent, objective,
-                                            nthread, gamma, min_child_weight,
-                                            max_delta_step, subsample,
-                                            colsample_bytree,
-                                            base_score, seed, missing)
-
-    def fit(self, X, y, sample_weight=None):
-        # pylint: disable = attribute-defined-outside-init,arguments-differ
-        self.classes_ = list(np.unique(y))
-        self.n_classes_ = len(self.classes_)
-        if self.n_classes_ > 2:
-            # Switch to using a multiclass objective in the underlying XGB instance
-            self.objective = "multi:softprob"
-            xgb_options = self.get_xgb_params()
-            xgb_options['num_class'] = self.n_classes_
-        else:
-            xgb_options = self.get_xgb_params()
-
-        self._le = LabelEncoder().fit(y)
-        training_labels = self._le.transform(y)
-
-        if sample_weight is not None:
-            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
-                                    missing=self.missing)
-        else:
-            train_dmatrix = DMatrix(X, label=training_labels,
-                                    missing=self.missing)
-
-        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators)
-
-        return self
-
-    def predict(self, data):
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        class_probs = self.booster().predict(test_dmatrix)
-        if len(class_probs.shape) > 1:
-            column_indexes = np.argmax(class_probs, axis=1)
-        else:
-            column_indexes = np.repeat(0, data.shape[0])
-            column_indexes[class_probs > 0.5] = 1
-        return self._le.inverse_transform(column_indexes)
-
-    def predict_proba(self, data):
-        test_dmatrix = DMatrix(data, missing=self.missing)
-        class_probs = self.booster().predict(test_dmatrix)
-        if self.objective == "multi:softprob":
-            return class_probs
-        else:
-            classone_probs = class_probs
-            classzero_probs = 1.0 - classone_probs
-            return np.vstack((classzero_probs, classone_probs)).transpose()
-
-class XGBRegressor(XGBModel, XGBRegressorBase):
-    # pylint: disable=missing-docstring
-    __doc__ = """
-    Implementation of the scikit-learn API for XGBoost regression
-    """ + "\n".join(XGBModel.__doc__.split('\n')[2:])
diff --git a/python-package/xgboost/plotting.py b/python-package/xgboost/plotting.py
new file mode 100644
index 000000000000..9c9b2a97d189
--- /dev/null
+++ b/python-package/xgboost/plotting.py
@@ -0,0 +1,223 @@
+# coding: utf-8
+# pylint: disable=too-many-locals, too-many-arguments, invalid-name,
+# pylint: disable=too-many-branches
+"""Plotting Library."""
+from __future__ import absolute_import
+
+import re
+import numpy as np
+from .core import Booster
+
+from io import BytesIO
+
+def plot_importance(booster, ax=None, height=0.2,
+                    xlim=None, title='Feature importance',
+                    xlabel='F score', ylabel='Features',
+                    grid=True, **kwargs):
+    """Plot importance based on fitted trees.
+
+    Parameters
+    ----------
+    booster : Booster or dict
+        Booster instance, or dict taken by Booster.get_fscore()
+    ax : matplotlib Axes, default None
+        Target axes instance. If None, new figure and axes will be created.
+    height : float, default 0.2
+        Bar height, passed to ax.barh()
+    xlim : tuple, default None
+        Tuple of 2 elements passed to ax.set_xlim(). If None, (0, 1.1 * max value) is used.
+    title : str, default "Feature importance"
+        Axes title. To disable, pass None.
+    xlabel : str, default "F score"
+        X axis title label. To disable, pass None.
+    ylabel : str, default "Features"
+        Y axis title label. To disable, pass None.
+    grid : bool, default True
+        Passed to ax.grid()
+    kwargs :
+        Other keywords passed to ax.barh()
+
+    Returns
+    -------
+    ax : matplotlib Axes
+    """
+    try:
+        import matplotlib.pyplot as plt
+    except ImportError:
+        raise ImportError('You must install matplotlib to plot importance')
+
+    if isinstance(booster, Booster):
+        importance = booster.get_fscore()
+    elif isinstance(booster, dict):
+        importance = booster
+    else:
+        raise ValueError('tree must be Booster or dict instance')
+
+    if len(importance) == 0:
+        raise ValueError('Booster.get_fscore() results in empty')
+
+    # order features by ascending score
+    tuples = sorted(importance.items(), key=lambda x: x[1])
+    labels, values = zip(*tuples)
+
+    if ax is None:
+        _, ax = plt.subplots(1, 1)
+
+    ylocs = np.arange(len(values))
+    ax.barh(ylocs, values, align='center', height=height, **kwargs)
+
+    for x, y in zip(values, ylocs):
+        ax.text(x + 1, y, x, va='center')
+
+    ax.set_yticks(ylocs)
+    ax.set_yticklabels(labels)
+
+    if xlim is not None:
+        if not isinstance(xlim, tuple) or len(xlim) != 2:  # reject anything but a 2-tuple
+            raise ValueError('xlim must be a tuple of 2 elements')
+    else:
+        xlim = (0, max(values) * 1.1)
+    ax.set_xlim(xlim)
+
+    if title is not None:
+        ax.set_title(title)
+    if xlabel is not None:
+        ax.set_xlabel(xlabel)
+    if ylabel is not None:
+        ax.set_ylabel(ylabel)
+    ax.grid(grid)
+    return ax
+
+
+_NODEPAT = re.compile(r'(\d+):\[(.+)\]')  # split node token: "<id>:[<condition>]"
+_LEAFPAT = re.compile(r'(\d+):(leaf=.+)')  # leaf token: "<id>:leaf=<...>"
+_EDGEPAT = re.compile(r'yes=(\d+),no=(\d+),missing=(\d+)')  # child ids listed after a node
+
+
+def _parse_node(graph, text):
+    """Add the node described by one dump token to graph and return its id.
+
+    Split nodes become circles, leaves become boxes; other text is rejected.
+    """
+    for pattern, shape in ((_NODEPAT, 'circle'), (_LEAFPAT, 'box')):
+        found = pattern.match(text)
+        if found is None:
+            continue
+        node_id, caption = found.groups()
+        graph.node(node_id, label=caption, shape=shape)
+        return node_id
+    raise ValueError('Unable to parse node: {0}'.format(text))
+
+
+def _parse_edge(graph, node, text, yes_color='#0000FF', no_color='#FF0000'):
+    """Draw the yes/no edges leaving node, as listed in one dump token.
+
+    The "missing" label goes on the yes edge when both share a target, else on the no edge.
+    """
+    found = _EDGEPAT.match(text)
+    if found is None:
+        raise ValueError('Unable to parse edge: {0}'.format(text))
+    yes_id, no_id, missing_id = found.groups()
+    yes_label = 'yes, missing' if yes_id == missing_id else 'yes'
+    no_label = 'no' if yes_id == missing_id else 'no, missing'
+    graph.edge(node, yes_id, label=yes_label, color=yes_color)
+    graph.edge(node, no_id, label=no_label, color=no_color)
+
+
+def to_graphviz(booster, num_trees=0, rankdir='UT',
+                yes_color='#0000FF', no_color='#FF0000', **kwargs):
+    """Convert specified tree to graphviz instance. IPython can automatically plot the
+    returned graphviz instance. Otherwise, you should call .render() method
+    of the returned graphviz instance.
+
+    Parameters
+    ----------
+    booster : Booster
+        Booster instance
+    num_trees : int, default 0
+        Specify the ordinal number of target tree
+    rankdir : str, default "UT"
+        Passed to graphviz via graph_attr
+        NOTE(review): Graphviz documents TB, LR, BT and RL for rankdir; confirm "UT" is intended.
+    yes_color : str, default '#0000FF'
+        Edge color when meets the node condition.
+    no_color : str, default '#FF0000'
+        Edge color when doesn't meet the node condition.
+    kwargs :
+        Other keywords passed to graphviz graph_attr
+
+    Returns
+    -------
+    graph : graphviz.Digraph
+        Graph holding the nodes and edges parsed from the tree dump.
+    """
+    try:
+        from graphviz import Digraph
+    except ImportError:
+        raise ImportError('You must install graphviz to plot tree')
+
+    if not isinstance(booster, Booster):
+        raise ValueError('booster must be Booster instance')
+
+    tree = booster.get_dump()[num_trees]
+    tree = tree.split()
+
+    kwargs = kwargs.copy()
+    kwargs.update({'rankdir': rankdir})
+    graph = Digraph(graph_attr=kwargs)
+
+    for i, text in enumerate(tree):
+        if text[0].isdigit():
+            node = _parse_node(graph, text)
+        else:
+            if i == 0:
+                # 1st string must be node
+                raise ValueError('Unable to parse given string as tree')
+            _parse_edge(graph, node, text, yes_color=yes_color,
+                        no_color=no_color)
+
+    return graph
+
+
+def plot_tree(booster, num_trees=0, rankdir='UT', ax=None, **kwargs):
+    """Render one tree of a booster onto a matplotlib Axes.
+
+    The tree is converted with to_graphviz, rendered to PNG through
+    graphviz and shown as an image with the axis decorations hidden.
+
+    Parameters
+    ----------
+    booster : Booster
+        Booster instance
+    num_trees : int, default 0
+        Ordinal number of the tree to draw
+    rankdir : str, default "UT"
+        Forwarded to to_graphviz
+    ax : matplotlib Axes, default None
+        Axes to draw on. A new figure and axes are created when None.
+    kwargs :
+        Other keywords passed to to_graphviz
+
+    Returns
+    -------
+    ax : matplotlib Axes
+    """
+    try:
+        import matplotlib.pyplot as plt
+        import matplotlib.image as image
+    except ImportError:
+        raise ImportError('You must install matplotlib to plot tree')
+
+    if ax is None:
+        _, ax = plt.subplots(1, 1)
+
+    graph = to_graphviz(booster, num_trees=num_trees, rankdir=rankdir, **kwargs)
+
+    # render the graph to PNG bytes and read them back as an image
+    png_buffer = BytesIO(graph.pipe(format='png'))
+    tree_image = image.imread(png_buffer)
+
+    # show the picture only; the axis decorations are switched off
+    ax.imshow(tree_image)
+    ax.axis('off')
+    return ax
diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py
new file mode 100644
index 000000000000..6f176972aced
--- /dev/null
+++ b/python-package/xgboost/sklearn.py
@@ -0,0 +1,339 @@
+# coding: utf-8
+# pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
+"""Scikit-Learn Wrapper interface for XGBoost."""
+from __future__ import absolute_import
+
+import numpy as np
+from .core import Booster, DMatrix, XGBoostError
+from .training import train
+
+try:
+    from sklearn.base import BaseEstimator
+    from sklearn.base import RegressorMixin, ClassifierMixin
+    from sklearn.preprocessing import LabelEncoder
+    SKLEARN_INSTALLED = True
+except ImportError:
+    SKLEARN_INSTALLED = False
+
+# used for compatibility without sklearn
+XGBModelBase = object
+XGBClassifierBase = object
+XGBRegressorBase = object
+
+if SKLEARN_INSTALLED:
+    XGBModelBase = BaseEstimator
+    XGBRegressorBase = RegressorMixin
+    XGBClassifierBase = ClassifierMixin
+
+class XGBModel(XGBModelBase):
+    # pylint: disable=too-many-arguments, too-many-instance-attributes, invalid-name
+    """Implementation of the Scikit-Learn API for XGBoost.
+
+    Parameters
+    ----------
+    max_depth : int
+        Maximum tree depth for base learners.
+    learning_rate : float
+        Boosting learning rate (xgb's "eta")
+    n_estimators : int
+        Number of boosted trees to fit.
+    silent : boolean
+        Whether to print messages while running boosting.
+    objective : string
+        Specify the learning task and the corresponding learning objective.
+
+    nthread : int
+        Number of parallel threads used to run xgboost.
+    gamma : float
+        Minimum loss reduction required to make a further partition on a leaf node of the tree.
+    min_child_weight : int
+        Minimum sum of instance weight(hessian) needed in a child.
+    max_delta_step : int
+        Maximum delta step we allow each tree's weight estimation to be.
+    subsample : float
+        Subsample ratio of the training instance.
+    colsample_bytree : float
+        Subsample ratio of columns when constructing each tree.
+
+    base_score:
+        The initial prediction score of all instances, global bias.
+    seed : int
+        Random number seed.
+    missing : float, optional
+        Value in the data which needs to be present as a missing value. If
+        None, defaults to np.nan.
+    """
+    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100,
+                 silent=True, objective="reg:linear",
+                 nthread=-1, gamma=0, min_child_weight=1, max_delta_step=0,
+                 subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        if not SKLEARN_INSTALLED:
+            raise XGBoostError('sklearn needs to be installed in order to use this module')
+        self.max_depth = max_depth
+        self.learning_rate = learning_rate
+        self.n_estimators = n_estimators
+        self.silent = silent
+        self.objective = objective
+
+        self.nthread = nthread
+        self.gamma = gamma
+        self.min_child_weight = min_child_weight
+        self.max_delta_step = max_delta_step
+        self.subsample = subsample
+        self.colsample_bytree = colsample_bytree
+
+        self.base_score = base_score
+        self.seed = seed
+        self.missing = missing if missing is not None else np.nan
+        self._Booster = None
+
+    def __setstate__(self, state):
+        # backward compatibility code
+        # older pickles stored the booster in raw form instead of a Booster;
+        # rebuild a Booster from it (Booster itself now supports pickle)
+        bst = state["_Booster"]
+        if bst is not None and not isinstance(bst, Booster):
+            state["_Booster"] = Booster(model_file=bst)
+        self.__dict__.update(state)
+
+    def booster(self):
+        """Get the underlying xgboost Booster of this model.
+
+        This will raise an exception when fit was not called
+
+        Returns
+        -------
+        booster : a xgboost booster of underlying model
+        """
+        if self._Booster is None:
+            raise XGBoostError('need to call fit beforehand')
+        return self._Booster
+
+    def get_params(self, deep=False):
+        """Get parameters."""
+        params = super(XGBModel, self).get_params(deep=deep)
+        if params['missing'] is np.nan:
+            params['missing'] = None  # sklearn doesn't handle nan. see #4725
+        if not params.get('eval_metric', True):
+            del params['eval_metric']  # don't give as None param to Booster
+        return params
+
+    def get_xgb_params(self):
+        """Get xgboost type parameters."""
+        xgb_params = self.get_params()
+
+        xgb_params['silent'] = 1 if self.silent else 0
+
+        if self.nthread <= 0:
+            xgb_params.pop('nthread', None)
+        return xgb_params
+
+    def fit(self, X, y, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable=missing-docstring,invalid-name,attribute-defined-outside-init
+        """
+        Fit the gradient boosting model
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        eval_set : list, optional
+            A list of (X, y) tuple pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
+        early_stopping_rounds : int
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set.  If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        trainDmatrix = DMatrix(X, label=y, missing=self.missing)
+
+        eval_results = {}
+        if eval_set is not None:
+            evals = [DMatrix(x[0], label=x[1], missing=self.missing) for x in eval_set]
+            evals = list(zip(evals, ["validation_{}".format(i) for i in
+                                     range(len(evals))]))
+        else:
+            evals = ()
+
+        params = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                params.update({'eval_metric': eval_metric})
+
+        self._Booster = train(params, trainDmatrix,
+                              self.n_estimators, evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+        if eval_results:
+            # train() records scores as strings; convert each history once
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results = eval_results
+
+        if early_stopping_rounds is not None:
+            self.best_score = self._Booster.best_score
+            self.best_iteration = self._Booster.best_iteration
+        return self
+
+    def predict(self, data):
+        # pylint: disable=missing-docstring,invalid-name
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        return self.booster().predict(test_dmatrix)
+
+
+class XGBClassifier(XGBModel, XGBClassifierBase):
+    # pylint: disable=missing-docstring,too-many-arguments,invalid-name
+    __doc__ = """Implementation of the scikit-learn API for XGBoost classification.
+
+    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
+
+    def __init__(self, max_depth=3, learning_rate=0.1,
+                 n_estimators=100, silent=True,
+                 objective="binary:logistic",
+                 nthread=-1, gamma=0, min_child_weight=1,
+                 max_delta_step=0, subsample=1, colsample_bytree=1,
+                 base_score=0.5, seed=0, missing=None):
+        super(XGBClassifier, self).__init__(max_depth, learning_rate,
+                                            n_estimators, silent, objective,
+                                            nthread, gamma, min_child_weight,
+                                            max_delta_step, subsample,
+                                            colsample_bytree,
+                                            base_score, seed, missing)
+
+    def fit(self, X, y, sample_weight=None, eval_set=None, eval_metric=None,
+            early_stopping_rounds=None, verbose=True):
+        # pylint: disable = attribute-defined-outside-init,arguments-differ
+        """
+        Fit gradient boosting classifier
+
+        Parameters
+        ----------
+        X : array_like
+            Feature matrix
+        y : array_like
+            Labels
+        sample_weight : array_like
+            Weight for each instance
+        eval_set : list, optional
+            A list of (X, y) pairs to use as a validation set for
+            early-stopping
+        eval_metric : str, callable, optional
+            If a str, should be a built-in evaluation metric to use. See
+            doc/parameter.md. If callable, a custom evaluation metric. The call
+            signature is func(y_predicted, y_true) where y_true will be a
+            DMatrix object such that you may need to call the get_label
+            method. It must return a str, value pair where the str is a name
+            for the evaluation and value is the value of the evaluation
+            function. This objective is always minimized.
+        early_stopping_rounds : int, optional
+            Activates early stopping. Validation error needs to decrease at
+            least every <early_stopping_rounds> round(s) to continue training.
+            Requires at least one item in eval_set.  If there's more than one,
+            will use the last. Returns the model from the last iteration
+            (not the best one). If early stopping occurs, the model will
+            have two additional fields: bst.best_score and bst.best_iteration.
+        verbose : bool
+            If `verbose` and an evaluation set is used, writes the evaluation
+            metric measured on the validation set to stderr.
+        """
+        eval_results = {}
+        self.classes_ = list(np.unique(y))
+        self.n_classes_ = len(self.classes_)
+        if self.n_classes_ > 2:
+            # Switch to using a multiclass objective in the underlying XGB instance
+            self.objective = "multi:softprob"
+            xgb_options = self.get_xgb_params()
+            xgb_options['num_class'] = self.n_classes_
+        else:
+            xgb_options = self.get_xgb_params()
+
+        feval = eval_metric if callable(eval_metric) else None
+        if eval_metric is not None:
+            if callable(eval_metric):
+                eval_metric = None
+            else:
+                xgb_options.update({"eval_metric": eval_metric})
+
+        self._le = LabelEncoder().fit(y)
+        training_labels = self._le.transform(y)
+
+        if eval_set is not None:
+            # TODO: use sample_weight if given?
+            evals = [DMatrix(x[0], label=self._le.transform(x[1]),
+                             missing=self.missing) for x in eval_set]
+            eval_names = ["validation_{}".format(i) for i in range(len(evals))]
+            evals = list(zip(evals, eval_names))
+        else:
+            evals = ()
+
+        if sample_weight is not None:
+            train_dmatrix = DMatrix(X, label=training_labels, weight=sample_weight,
+                                    missing=self.missing)
+        else:
+            train_dmatrix = DMatrix(X, label=training_labels,
+                                    missing=self.missing)
+
+        self._Booster = train(xgb_options, train_dmatrix, self.n_estimators,
+                              evals=evals,
+                              early_stopping_rounds=early_stopping_rounds,
+                              evals_result=eval_results, feval=feval,
+                              verbose_eval=verbose)
+
+        if eval_results:
+            eval_results = {k: np.array(v, dtype=float)
+                            for k, v in eval_results.items()}
+            self.eval_results = eval_results
+
+        if early_stopping_rounds is not None:
+            self.best_score = self._Booster.best_score
+            self.best_iteration = self._Booster.best_iteration
+
+        return self
+
+    def predict(self, data):
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        class_probs = self.booster().predict(test_dmatrix)
+        if len(class_probs.shape) > 1:
+            column_indexes = np.argmax(class_probs, axis=1)
+        else:
+            column_indexes = np.repeat(0, data.shape[0])
+            column_indexes[class_probs > 0.5] = 1
+        return self._le.inverse_transform(column_indexes)
+
+    def predict_proba(self, data):
+        test_dmatrix = DMatrix(data, missing=self.missing)
+        class_probs = self.booster().predict(test_dmatrix)
+        if self.objective == "multi:softprob":
+            return class_probs
+        else:
+            classone_probs = class_probs
+            classzero_probs = 1.0 - classone_probs
+            return np.vstack((classzero_probs, classone_probs)).transpose()
+
+class XGBRegressor(XGBModel, XGBRegressorBase):
+    # pylint: disable=missing-docstring
+    __doc__ = """Implementation of the scikit-learn API for XGBoost regression.
+    """ + '\n'.join(XGBModel.__doc__.split('\n')[2:])
diff --git a/python-package/xgboost/training.py b/python-package/xgboost/training.py
new file mode 100644
index 000000000000..1f2d722aca0f
--- /dev/null
+++ b/python-package/xgboost/training.py
@@ -0,0 +1,252 @@
+# coding: utf-8
+# pylint: disable=too-many-locals, too-many-arguments, invalid-name
+"""Training Library containing training routines."""
+from __future__ import absolute_import
+
+import sys
+import re
+import numpy as np
+from .core import Booster, STRING_TYPES
+
+def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
+          early_stopping_rounds=None, evals_result=None, verbose_eval=True):
+    # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
+    """Train a booster with given parameters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round: int
+        Number of boosting iterations.
+    evals: list of pairs (DMatrix, string)
+        List of items to be evaluated during training, this allows user to watch
+        performance on the validation set.
+    obj : function
+        Customized objective function.
+    feval : function
+        Customized evaluation function.
+    early_stopping_rounds: int
+        Activates early stopping. Validation error needs to decrease at least
+        every <early_stopping_rounds> round(s) to continue training.
+        Requires at least one item in evals.
+        If there's more than one, will use the last.
+        Returns the model from the last iteration (not the best one).
+        If early stopping occurs, the model will have two additional fields:
+        bst.best_score and bst.best_iteration.
+    evals_result: dict
+        Cleared, then filled with one list of score strings per name in evals.
+    verbose_eval : bool
+        If `verbose_eval` then the evaluation metric on the validation set, if
+        given, is printed at each boosting stage.
+
+    Returns
+    -------
+    booster : a trained booster model
+    """
+    evals = list(evals)
+    bst = Booster(params, [dtrain] + [d[0] for d in evals])
+
+    if evals_result is not None:
+        if not isinstance(evals_result, dict):
+            raise TypeError('evals_result has to be a dictionary')
+        else:
+            evals_name = [d[1] for d in evals]
+            evals_result.clear()
+            evals_result.update({key: [] for key in evals_name})
+
+    if not early_stopping_rounds:
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            if len(evals) != 0:
+                bst_eval_set = bst.eval_set(evals, i, feval)
+                if isinstance(bst_eval_set, STRING_TYPES):
+                    msg = bst_eval_set
+                else:
+                    msg = bst_eval_set.decode()
+
+                if verbose_eval:
+                    sys.stderr.write(msg + '\n')
+                if evals_result is not None:
+                    res = re.findall(r":(-?[0-9.]+)", msg)
+                    for key, val in zip(evals_name, res):
+                        evals_result[key].append(val)
+        return bst
+
+    else:
+        # early stopping
+        if len(evals) < 1:
+            raise ValueError('For early stopping you need at least one set in evals.')
+
+        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n".format(\
+                evals[-1][1], early_stopping_rounds))
+
+        # is params a list of tuples? are we using multiple eval metrics?
+        if isinstance(params, list):
+            if len(params) != len(dict(params).items()):
+                raise ValueError('Check your params.'\
+                                     'Early stopping works with single eval metric only.')
+            params = dict(params)
+
+        # either minimize loss or maximize AUC/MAP/NDCG
+        maximize_score = False
+        if 'eval_metric' in params:
+            maximize_metrics = ('auc', 'map', 'ndcg')
+            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
+                maximize_score = True
+
+        if maximize_score:
+            best_score = 0.0
+        else:
+            best_score = float('inf')
+
+        best_msg = ''
+        best_score_i = 0
+
+        for i in range(num_boost_round):
+            bst.update(dtrain, i, obj)
+            bst_eval_set = bst.eval_set(evals, i, feval)
+
+            if isinstance(bst_eval_set, STRING_TYPES):
+                msg = bst_eval_set
+            else:
+                msg = bst_eval_set.decode()
+
+            if verbose_eval:
+                sys.stderr.write(msg + '\n')
+            # same parsing as the non-early-stopping branch: full signed score
+            if evals_result is not None:
+                res = re.findall(r":(-?[0-9.]+)", msg)
+                for key, val in zip(evals_name, res):
+                    evals_result[key].append(val)
+
+            score = float(msg.rsplit(':', 1)[1])
+            if (maximize_score and score > best_score) or \
+                    (not maximize_score and score < best_score):
+                best_score = score
+                best_score_i = i
+                best_msg = msg
+            elif i - best_score_i >= early_stopping_rounds:
+                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
+                bst.best_score = best_score
+                bst.best_iteration = best_score_i
+                break
+        bst.best_score = best_score
+        bst.best_iteration = best_score_i
+        return bst
+
+
+class CVPack(object):
+    """Auxiliary datastruct to hold one fold of CV."""
+    def __init__(self, dtrain, dtest, param):
+        """Initialize the CVPack"""
+        self.dtrain = dtrain
+        self.dtest = dtest
+        self.watchlist = [(dtrain, 'train'), (dtest, 'test')]
+        self.bst = Booster(param, [dtrain, dtest])
+
+    def update(self, iteration, fobj):
+        """Update the boosters for one iteration"""
+        self.bst.update(self.dtrain, iteration, fobj)
+
+    def eval(self, iteration, feval):
+        """Evaluate the CVPack for one iteration."""
+        return self.bst.eval_set(self.watchlist, iteration, feval)
+
+
+def mknfold(dall, nfold, param, seed, evals=(), fpreproc=None):
+    """
+    Make an n-fold list of CVPack from random indices (folds of len // nfold rows).
+    """
+    evals = list(evals)
+    np.random.seed(seed)
+    randidx = np.random.permutation(dall.num_row())
+    kstep = len(randidx) // nfold  # floor division: slice bounds must be ints on Python 3
+    idset = [randidx[(i * kstep): min(len(randidx), (i + 1) * kstep)] for i in range(nfold)]
+    ret = []
+    for k in range(nfold):
+        dtrain = dall.slice(np.concatenate([idset[i] for i in range(nfold) if k != i]))
+        dtest = dall.slice(idset[k])
+        # run preprocessing on the data set if needed
+        if fpreproc is not None:
+            dtrain, dtest, tparam = fpreproc(dtrain, dtest, param.copy())
+        else:
+            tparam = param
+        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
+        ret.append(CVPack(dtrain, dtest, plst))
+    return ret
+
+
+def aggcv(rlist, show_stdv=True):
+    # pylint: disable=invalid-name
+    """
+    Aggregate per-fold evaluation strings into one line of cv-<name>:mean[+std] entries.
+    """
+    cvmap = {}
+    ret = rlist[0].split()[0]
+    for line in rlist:
+        arr = line.split()
+        assert ret == arr[0]
+        for it in arr[1:]:
+            if not isinstance(it, STRING_TYPES):
+                it = it.decode()
+            k, v = it.split(':')
+            if k not in cvmap:
+                cvmap[k] = []
+            cvmap[k].append(float(v))
+    for k, v in sorted(cvmap.items(), key=lambda x: x[0]):
+        v = np.array(v)
+        if not isinstance(ret, STRING_TYPES):
+            ret = ret.decode()
+        if show_stdv:
+            ret += '\tcv-%s:%f+%f' % (k, np.mean(v), np.std(v))
+        else:
+            ret += '\tcv-%s:%f' % (k, np.mean(v))
+    return ret
+
+
+def cv(params, dtrain, num_boost_round=10, nfold=3, metrics=(),
+       obj=None, feval=None, fpreproc=None, show_stdv=True, seed=0):
+    # pylint: disable = invalid-name
+    """Cross-validation with given parameters.
+
+    Parameters
+    ----------
+    params : dict
+        Booster params.
+    dtrain : DMatrix
+        Data to be trained.
+    num_boost_round : int
+        Number of boosting iterations.
+    nfold : int
+        Number of folds in CV.
+    metrics : list of strings
+        Evaluation metrics to be watched in CV.
+    obj : function
+        Custom objective function.
+    feval : function
+        Custom evaluation function.
+    fpreproc : function
+        Preprocessing function that takes (dtrain, dtest, param) and returns
+        transformed versions of those.
+    show_stdv : bool
+        Whether to display the standard deviation.
+    seed : int
+        Seed used to generate the folds (passed to numpy.random.seed).
+
+    Returns
+    -------
+    evaluation history : list of strings, one aggregated line per boosting round
+    """
+    results = []
+    cvfolds = mknfold(dtrain, nfold, params, seed, metrics, fpreproc)
+    for i in range(num_boost_round):
+        for fold in cvfolds:
+            fold.update(i, obj)
+        res = aggcv([f.eval(i, feval) for f in cvfolds], show_stdv)
+        sys.stderr.write(res + '\n')
+        results.append(res)
+    return results
+
diff --git a/scripts/travis_after_failure.sh b/scripts/travis_after_failure.sh
index 230f3348c5d0..921e14953e4b 100755
--- a/scripts/travis_after_failure.sh
+++ b/scripts/travis_after_failure.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 
 if [ ${TASK} == "R-package" ]; then
-    cat R-package/xgboost.Rcheck/00install.out
-fi
\ No newline at end of file
+    cat xgboost/xgboost.Rcheck/*.log
+fi
diff --git a/scripts/travis_osx_install.sh b/scripts/travis_osx_install.sh
new file mode 100755
index 000000000000..adc620a52922
--- /dev/null
+++ b/scripts/travis_osx_install.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+# Install build/test dependencies on OS X workers; exits early on any other OS.
+if [ "${TRAVIS_OS_NAME}" != "osx" ]; then
+    exit 0
+fi
+
+brew update
+
+if [ "${TASK}" == "python-package" ]; then
+    brew install python git graphviz
+    easy_install pip
+    pip install numpy scipy matplotlib nose
+fi
+
+if [ "${TASK}" == "python-package3" ]; then
+    brew install python3 git graphviz
+    sudo pip3 install --upgrade setuptools
+    pip3 install numpy scipy matplotlib nose graphviz
+fi
diff --git a/scripts/travis_script.sh b/scripts/travis_script.sh
index 5702d35cd6b5..c5708b0c8b9e 100755
--- a/scripts/travis_script.sh
+++ b/scripts/travis_script.sh
@@ -2,7 +2,14 @@
 
 # main script of travis
 if [ ${TASK} == "lint" ]; then
-    make lint  || exit -1
+    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
+        make lint  || exit -1
+    fi
+fi
+
+if [ ${TRAVIS_OS_NAME} == "osx" ]; then
+    export no_omp=1
+    export NO_OPENMP=1
 fi
 
 if [ ${TASK} == "build" ]; then
@@ -12,7 +19,11 @@ fi
 if [ ${TASK} == "build-with-dmlc" ]; then
     cd dmlc-core
     cp make/config.mk .
-    echo "USE_S3=1" >> config.mk
+    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
+        echo "USE_S3=1" >> config.mk
+    else
+        echo "USE_S3=0" >> config.mk
+    fi
     make all CXX=${CXX}|| exit -1
     cd ..
     make dmlc=dmlc-core CXX=${CXX} || exit -1
@@ -23,11 +34,35 @@ if [ ${TASK} == "R-package" ]; then
 fi
 
 if [ ${TASK} == "python-package" ]; then
+    sudo apt-get install graphviz
+    sudo apt-get install python-numpy python-scipy python-matplotlib python-nose
+    sudo python -m pip install graphviz
     make all CXX=${CXX} || exit -1
     nosetests tests/python || exit -1
 fi
 
+if [ ${TASK} == "python-package3" ]; then
+    sudo apt-get install graphviz
+    # python3-matplotlib is unavailable on Ubuntu 12.04
+    sudo apt-get install python3-dev
+    sudo apt-get install python3-numpy python3-scipy python3-nose python3-setuptools
+
+    make all CXX=${CXX} || exit -1
+
+    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
+        sudo easy_install3 pip
+        sudo easy_install3 -U distribute
+        sudo pip install graphviz matplotlib
+        nosetests3 tests/python || exit -1
+    else
+        nosetests tests/python || exit -1
+    fi
+fi
+
+# only test java under linux for now
 if [ ${TASK} == "java-package" ]; then
-    make java CXX=${CXX} || exit -1
-    scripts/travis_java_script.sh || exit -1
+    if [ ${TRAVIS_OS_NAME} != "osx" ]; then
+        make java CXX=${CXX} || exit -1
+        scripts/travis_java_script.sh || exit -1
+    fi
 fi
diff --git a/src/io/page_fmatrix-inl.hpp b/src/io/page_fmatrix-inl.hpp
index 2aaec5b19e44..2fa5c83bd950 100644
--- a/src/io/page_fmatrix-inl.hpp
+++ b/src/io/page_fmatrix-inl.hpp
@@ -319,7 +319,7 @@ class FMatrixPage : public IFMatrix {
       bytes_write += spage;
       double tnow = rabit::utils::GetTime();
       double tdiff = tnow - tstart;
-      utils::Printf("Writting to %s in %g MB/s, %lu MB written current speed:%g MB/s\n",
+      utils::Printf("Writting to %s in %g MB/s, %lu MB written\n",
                     col_data_name_.c_str(),
                     (bytes_write >> 20UL) / tdiff,
                     (bytes_write >> 20UL));
diff --git a/src/utils/omp.h b/src/utils/omp.h
index ddd3467d9e48..c7a04dc3207b 100644
--- a/src/utils/omp.h
+++ b/src/utils/omp.h
@@ -7,10 +7,10 @@
 #ifndef XGBOOST_UTILS_OMP_H_
 #define XGBOOST_UTILS_OMP_H_
 
-#if defined(_OPENMP)
+#if defined(_OPENMP) && !defined(DISABLE_OPENMP)
 #include <omp.h>
 #else
-#ifndef DISABLE_OPENMP
+#if !defined(DISABLE_OPENMP) && !defined(_MSC_VER)
 // use pragma message instead of warning
 #pragma message("Warning: OpenMP is not available,"\
                 "xgboost will be compiled into single-thread code."\
diff --git a/src/utils/quantile.h b/src/utils/quantile.h
index ffd9142da547..adcd0222de7d 100644
--- a/src/utils/quantile.h
+++ b/src/utils/quantile.h
@@ -173,14 +173,6 @@ struct WQSummary {
       }
     }
   }
-  /*! \brief used for debug purpose, print the summary */
-  inline void Print(void) const {
-    for (size_t i = 0; i < size; ++i) {
-      std::cout << "x=" << data[i].value << "\t"
-                << "[" << data[i].rmin << "," << data[i].rmax << "]"
-                << " wmin=" << data[i].wmin << std::endl;
-    }
-  }
   /*!
    * \brief set current summary to be pruned summary of src
    *        assume data field is already allocated to be at least maxsize
@@ -276,9 +268,67 @@ struct WQSummary {
       } while (b != b_end);
     }
     this->size = dst - data;
+    const RType tol = 10;
+    RType err_mingap, err_maxgap, err_wgap;
+    this->FixError(&err_mingap, &err_maxgap, &err_wgap);
+    if (err_mingap > tol || err_maxgap > tol || err_wgap > tol) {
+      utils::Printf("INFO: mingap=%g, maxgap=%g, wgap=%g\n",
+                    err_mingap, err_maxgap, err_wgap);
+    }
+
     utils::Assert(size <= sa.size + sb.size, "bug in combine");
   }
+  // helper function to print the current content of sketch
+  inline void Print() const {
+    for (size_t i = 0; i < this->size; ++i) {
+      utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g\n",
+                    i, data[i].rmin, data[i].rmax,
+                    data[i].wmin, data[i].value);
+    }
+  }
+  // repair rmin/rmax values that rounding error pushed out of order; the
+  // largest correction of each kind is reported through the three out-params
+  inline void FixError(RType *err_mingap,
+                       RType *err_maxgap,
+                       RType *err_wgap) const {
+    *err_mingap = 0;
+    *err_maxgap = 0;
+    *err_wgap = 0;
+    RType prev_rmin = 0, prev_rmax = 0;
+    for (size_t i = 0; i < this->size; ++i) {
+      if (data[i].rmin < prev_rmin) {
+        *err_mingap = std::max(*err_mingap, prev_rmin - data[i].rmin);
+        data[i].rmin = prev_rmin;
+      } else {
+        prev_rmin = data[i].rmin;
+      }
+      if (data[i].rmax < prev_rmax) {
+        *err_maxgap = std::max(*err_maxgap, prev_rmax - data[i].rmax);
+        data[i].rmax = prev_rmax;
+      }
+      RType rmin_next = data[i].rmin_next();
+      if (data[i].rmax < rmin_next) {
+        *err_wgap = std::max(*err_wgap, rmin_next - data[i].rmax);
+        data[i].rmax = rmin_next;
+      }
+      prev_rmax = data[i].rmax;
+    }
+  }
+  // check consistency of the summary
+  inline bool Check(const char *msg) const {
+    const float tol = 10.0f;
+    for (size_t i = 0; i < this->size; ++i) {
+      if (data[i].rmin + data[i].wmin > data[i].rmax + tol ||
+          data[i].rmin < -1e-6f || data[i].rmax < -1e-6f) {
+        utils::Printf("----%s: Check not Pass------\n", msg);
+        this->Print();
+        return false;
+      }
+    }
+    return true;
+  }
 };
+
 /*! \brief try to do efficient prunning */
 template<typename DType, typename RType>
 struct WXQSummary : public WQSummary<DType, RType> {
@@ -334,11 +384,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
       utils::Printf("LOG: srcsize=%lu, maxsize=%lu, range=%g, chunk=%g\n",
                     src.size, maxsize, static_cast<double>(range),
                     static_cast<double>(chunk));
-      for (size_t i = 0; i < src.size; ++i) {
-        utils::Printf("[%lu] rmin=%g, rmax=%g, wmin=%g, v=%g, isbig=%d\n", i,
-                      src.data[i].rmin, src.data[i].rmax,  src.data[i].wmin,
-                      src.data[i].value, CheckLarge(src.data[i], chunk));
-      }
+      src.Print();
       utils::Assert(nbig < n - 1, "quantile: too many large chunk");
     }
     this->data[0] = src.data[0];
@@ -357,6 +403,7 @@ struct WXQSummary : public WQSummary<DType, RType> {
             if (dx2 >= maxdx2) break;
             while (i < end &&
                    dx2 >= src.data[i + 1].rmax + src.data[i + 1].rmin) ++i;
+            if (i == end) break;
             if (dx2 < src.data[i].rmin_next() + src.data[i + 1].rmax_prev()) {
               if (i != lastidx) {
                 this->data[this->size++] = src.data[i]; lastidx = i;
diff --git a/src/utils/thread.h b/src/utils/thread.h
index 78b488cff7de..a6e8e7fdc7fe 100644
--- a/src/utils/thread.h
+++ b/src/utils/thread.h
@@ -11,7 +11,7 @@
 #ifdef _MSC_VER
 #include <windows.h>
 #include <process.h>
-#include "../xgboost/utils.h"
+#include "./utils.h"
 namespace xgboost {
 namespace utils {
 /*! \brief simple semaphore used for synchronization */
diff --git a/src/utils/thread_buffer.h b/src/utils/thread_buffer.h
index 2119f53ab670..bc4fb9f5e0d9 100644
--- a/src/utils/thread_buffer.h
+++ b/src/utils/thread_buffer.h
@@ -11,9 +11,14 @@
 #include <cstring>
 #include <cstdlib>
 #include "./utils.h"
+// the threading utilities are not available on Solaris, so skip them in strict C++98 mode
+#ifndef XGBOOST_STRICT_CXX98_
 #include "./thread.h"
+#endif
+
 namespace xgboost {
 namespace utils {
+#if !defined(XGBOOST_STRICT_CXX98_)
 /*!
  * \brief buffered loading iterator that uses multithread
  * this template method will assume the following paramters
@@ -201,6 +206,52 @@ class ThreadBuffer {
     loading_need.Post();
   }
 };
+#else
+// a dummy single threaded ThreadBuffer: all loading happens on the calling thread
+// use this to resolve R's solaris compatibility for now (built when XGBOOST_STRICT_CXX98_ is set)
+template<typename Elem, typename ElemFactory>
+class ThreadBuffer {
+ public:
+  ThreadBuffer() : init_end_(false) {}
+  ~ThreadBuffer() {
+    if (init_end_) {  // only release what a successful Init() acquired
+      factory_.FreeSpace(data_);
+      factory_.Destroy();
+    }
+  }
+  inline void SetParam(const char *name, const char *val) {  // both arguments are ignored
+  }
+  inline bool Init(void) {  // returns false when the factory fails to initialize
+    if (!factory_.Init()) return false;
+    data_ = factory_.Create();
+    return (init_end_ = true);  // assignment is intended: set the flag and return true
+  }
+  inline void BeforeFirst(void) {  // delegates directly to the factory
+    factory_.BeforeFirst();
+  }
+  inline bool Next(Elem &elem) { // NOLINT(*)
+    if (factory_.LoadNext(data_)) {  // synchronous load into the single element
+      elem = data_; return true;
+    } else {
+      return false;
+    }
+  }
+  inline ElemFactory &get_factory() {
+    return factory_;
+  }
+  inline const ElemFactory &get_factory() const {
+    return factory_;
+  }
+
+ private:
+  // set once Init() has succeeded; tells the destructor that cleanup is needed
+  bool init_end_;
+  // the single element: created in Init(), refilled by every Next()
+  Elem data_;
+  // factory object that creates, loads and frees the element
+  ElemFactory factory_;
+};
+#endif  // !defined(XGBOOST_STRICT_CXX98_)
 }  // namespace utils
 }  // namespace xgboost
 #endif  // XGBOOST_UTILS_THREAD_BUFFER_H_
diff --git a/subtree/rabit/windows/rabit/rabit.vcxproj b/subtree/rabit/windows/rabit/rabit.vcxproj
index 5948e4c17332..c9594b182530 100644
--- a/subtree/rabit/windows/rabit/rabit.vcxproj
+++ b/subtree/rabit/windows/rabit/rabit.vcxproj
@@ -29,7 +29,7 @@
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
   <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
-    <ConfigurationType>Application</ConfigurationType>
+    <ConfigurationType>StaticLibrary</ConfigurationType>
     <UseDebugLibraries>true</UseDebugLibraries>
     <CharacterSet>MultiByte</CharacterSet>
   </PropertyGroup>
diff --git a/tests/python/test_basic.py b/tests/python/test_basic.py
index 77d19595b53f..93ebaa7fdadf 100644
--- a/tests/python/test_basic.py
+++ b/tests/python/test_basic.py
@@ -29,3 +29,47 @@ def test_basic():
     # assert they are the same
     assert np.sum(np.abs(preds2-preds)) == 0
 
+def test_plotting():
+    bst2 = xgb.Booster(model_file='xgb.model')  # NOTE(review): presumably saved by test_basic - confirm
+    # plotting checks; matplotlib and graphviz are imported inside the test body
+
+    import matplotlib
+    matplotlib.use('Agg')  # non-interactive backend, so no display is needed
+
+    from matplotlib.axes import Axes
+    from graphviz import Digraph
+
+    ax = xgb.plot_importance(bst2)  # defaults: expect the built-in title and axis labels
+    assert isinstance(ax, Axes)
+    assert ax.get_title() == 'Feature importance'
+    assert ax.get_xlabel() == 'F score'
+    assert ax.get_ylabel() == 'Features'
+    assert len(ax.patches) == 4
+
+    ax = xgb.plot_importance(bst2, color='r',  # one color for all bars, custom labels
+                             title='t', xlabel='x', ylabel='y')
+    assert isinstance(ax, Axes)
+    assert ax.get_title() == 't'
+    assert ax.get_xlabel() == 'x'
+    assert ax.get_ylabel() == 'y'
+    assert len(ax.patches) == 4
+    for p in ax.patches:
+        assert p.get_facecolor() == (1.0, 0, 0, 1.0) # red
+
+
+    ax = xgb.plot_importance(bst2, color=['r', 'r', 'b', 'b'],  # per-bar colors, labels disabled
+                             title=None, xlabel=None, ylabel=None)
+    assert isinstance(ax, Axes)
+    assert ax.get_title() == ''
+    assert ax.get_xlabel() == ''
+    assert ax.get_ylabel() == ''
+    assert len(ax.patches) == 4
+    assert ax.patches[0].get_facecolor() == (1.0, 0, 0, 1.0) # red
+    assert ax.patches[1].get_facecolor() == (1.0, 0, 0, 1.0) # red
+    assert ax.patches[2].get_facecolor() == (0, 0, 1.0, 1.0) # blue
+    assert ax.patches[3].get_facecolor() == (0, 0, 1.0, 1.0) # blue
+
+    g = xgb.to_graphviz(bst2, num_trees=0)  # tree index 0 as a graphviz object
+    assert isinstance(g, Digraph)
+    ax = xgb.plot_tree(bst2, num_trees=0)  # same tree drawn onto matplotlib axes
+    assert isinstance(ax, Axes)
diff --git a/windows/README.md b/windows/README.md
index cb1cc9dd9eb4..564c97d25337 100644
--- a/windows/README.md
+++ b/windows/README.md
@@ -11,7 +11,7 @@ This should give you xgboost.exe for CLI version and xgboost_wrapper.dll for pyt
 
 Use Python Module
 =====
-* After you build the dll, you can install the Python package from the [../wrapper](../wrapper) folder
+* After you build the dll, you can install the Python package from the [../python-package](../python-package) folder
 
 ```
 python setup.py install
diff --git a/windows/xgboost.sln b/windows/xgboost.sln
index 7bd8db5b2c5c..d94c14932231 100644
--- a/windows/xgboost.sln
+++ b/windows/xgboost.sln
@@ -7,6 +7,9 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost", "xgboost\xgboost.
 	EndProjectSection
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "xgboost_wrapper", "xgboost_wrapper\xgboost_wrapper.vcxproj", "{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}"
+	ProjectSection(ProjectDependencies) = postProject
+		{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F} = {D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}
+	EndProjectSection
 EndProject
 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rabit", "..\subtree\rabit\windows\rabit\rabit.vcxproj", "{D7B77D06-4F5F-4BD7-B81E-7CC8EBBE684F}"
 EndProject
@@ -22,15 +25,15 @@ Global
 	GlobalSection(ProjectConfigurationPlatforms) = postSolution
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.ActiveCfg = Debug|Win32
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|Win32.Build.0 = Debug|Win32
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Release|x64
-		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.Build.0 = Release|x64
+		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Debug|x64.ActiveCfg = Debug|x64
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.ActiveCfg = Release|Win32
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|Win32.Build.0 = Release|Win32
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.ActiveCfg = Release|x64
 		{19766C3F-7508-49D0-BAAC-0988FCC9970C}.Release|x64.Build.0 = Release|x64
 		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.ActiveCfg = Debug|Win32
 		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|Win32.Build.0 = Debug|Win32
-		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|Win32
+		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.ActiveCfg = Debug|x64
+		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Debug|x64.Build.0 = Debug|x64
 		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.ActiveCfg = Release|Win32
 		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|Win32.Build.0 = Release|Win32
 		{B0E22ADD-7849-4D3A-BDC6-0932C5F11ED5}.Release|x64.ActiveCfg = Release|x64
@@ -46,7 +49,6 @@ Global
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.ActiveCfg = Debug|Win32
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|Win32.Build.0 = Debug|Win32
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.ActiveCfg = Debug|x64
-		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Debug|x64.Build.0 = Debug|x64
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.ActiveCfg = Release|Win32
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|Win32.Build.0 = Release|Win32
 		{20A0E4D7-20C7-4EC1-BDF6-0D469CE239AA}.Release|x64.ActiveCfg = Release|x64
diff --git a/windows/xgboost/xgboost.vcxproj b/windows/xgboost/xgboost.vcxproj
index c14d84645f73..00846f36abcd 100644
--- a/windows/xgboost/xgboost.vcxproj
+++ b/windows/xgboost/xgboost.vcxproj
@@ -85,6 +85,7 @@
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
diff --git a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
index 62f7d0fd3315..cff3cde6559a 100644
--- a/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
+++ b/windows/xgboost_wrapper/xgboost_wrapper.vcxproj
@@ -86,6 +86,7 @@
     </ClCompile>
     <Link>
       <GenerateDebugInformation>true</GenerateDebugInformation>
+      <AdditionalDependencies>$(OutDir)\rabit.lib;%(AdditionalDependencies)</AdditionalDependencies>
     </Link>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
diff --git a/wrapper/README.md b/wrapper/README.md
index ab013faf601e..77316e15c117 100644
--- a/wrapper/README.md
+++ b/wrapper/README.md
@@ -1,19 +1,9 @@
-Wrapper of XGBoost
-=====
-This folder provides wrapper of xgboost to other languages
-
-Python
-=====
-* To make the python module, type ```./build.sh``` in the root directory of project
-* Install with `python setup.py install` from this directory.
-* Refer also to the walk through example in [demo folder](../demo/guide-python)
-* **NOTE**: if you want to run XGBoost process in parallel using the fork backend for joblib/multiprocessing, you must build XGBoost without support for OpenMP by `make no_omp=1`. Otherwise, use the forkserver (in Python 3.4) or spawn backend. See the sklearn_parallel.py demo.
-
-
-R
-=====
-* See [R-package](../R-package)
-
-Julia
-=====
-* See [XGBoost.jl](https://github.com/antinucleon/XGBoost.jl)
+XGBoost Wrappers
+================
+This folder provides the wrapper code used to build xgboost packages for other languages.
+
+***Supported Language Packages***
+* [Python package](../python-package)
+* [R-package](../R-package)
+* [Java Package](../java)
+* [Julia Package](https://github.com/antinucleon/XGBoost.jl)
diff --git a/wrapper/__init__.py b/wrapper/__init__.py
deleted file mode 100644
index e69de29bb2d1..000000000000
diff --git a/wrapper/setup.py b/wrapper/setup.py
deleted file mode 100644
index 5365d61b0b0a..000000000000
--- a/wrapper/setup.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# pylint: disable=invalid-name
-"""Setup xgboost package."""
-import os
-import platform
-from setuptools import setup
-
-
-class XGBoostLibraryNotFound(Exception):
-    """Exception to raise when xgboost library cannot be found."""
-    pass
-
-
-curr_dir = os.path.dirname(os.path.abspath(__file__))
-dll_path = [curr_dir]
-
-if os.name == 'nt':
-    if platform.architecture()[0] == '64bit':
-        dll_path.append(os.path.join(curr_dir, '../windows/x64/Release/'))
-    else:
-        dll_path.append(os.path.join(curr_dir, '../windows/Release/'))
-
-
-if os.name == 'nt':
-    dll_path = [os.path.join(p, 'xgboost_wrapper.dll') for p in dll_path]
-else:
-    dll_path = [os.path.join(p, 'libxgboostwrapper.so') for p in dll_path]
-
-lib_path = [p for p in dll_path if os.path.exists(p) and os.path.isfile(p)]
-
-if len(lib_path) == 0:
-    raise XGBoostLibraryNotFound("XGBoost library not found. Did you run "
-                                 "../make?")
-setup(name="xgboost",
-      version="0.40",
-      description="Python wrappers for XGBoost: eXtreme Gradient Boosting",
-      zip_safe=False,
-      py_modules=['xgboost'],
-      data_files=[('.', [lib_path[0]])],
-      url="https://github.com/dmlc/xgboost")
diff --git a/wrapper/xgboost_wrapper.cpp b/wrapper/xgboost_wrapper.cpp
index 18c1eae4923c..6956b567d27f 100644
--- a/wrapper/xgboost_wrapper.cpp
+++ b/wrapper/xgboost_wrapper.cpp
@@ -98,7 +98,14 @@ class Booster: public learner::BoostLearner {
  private:
   bool init_model;
 };
+}  // namespace wrapper
+}  // namespace xgboost
+
+using namespace xgboost::wrapper;
 
+#ifndef XGBOOST_STRICT_CXX98_
+namespace xgboost {
+namespace wrapper {
 // helper to support threadlocal
 struct ThreadLocalStore {
   std::vector<std::string*> data;
@@ -126,17 +133,17 @@ static ThreadLocalStore thread_local_store;
 }  // namespace wrapper
 }  // namespace xgboost
 
-using namespace xgboost::wrapper;
-
 /*! \brief  macro to guard beginning and end section of all functions */
 #define API_BEGIN() try {
 /*!
  * \brief every function starts with API_BEGIN(); and finishes with API_END();
  * \param Finalize optionally put in a finalizer
  */
-#define API_END(Finalize) } catch(std::exception &e) {  \
+#define API_END_FINALIZE(Finalize) } catch(std::exception &e) {  \
     Finalize; return XGBHandleException(e);             \
   } return 0;
+/*! \brief API End with no finalization */
+#define API_END() API_END_FINALIZE(;)
 
 // do not use threadlocal on OSX since it is not always available
 #ifndef DISABLE_THREAD_LOCAL
@@ -171,6 +178,17 @@ const char *XGBSetGetLastError_(const char *str_set) {
   }
   return last_error->c_str();
 }
+#else
+// crippled implementation for solaris case: no try/catch, the API_END macros just return 0
+// exception handling is not needed for R, so it is OK; note the Finalize argument is discarded
+#define API_BEGIN()
+#define API_END_FINALIZE(Finalize) return 0
+#define API_END() return 0
+
+const char *XGBSetGetLastError_(const char *str_set) {
+  return NULL;  // str_set is ignored; no error message is stored in this mode
+}
+#endif  // XGBOOST_STRICT_CXX98_
 
 /*! \brief return str message of the last error */
 const char *XGBGetLastError() {
@@ -217,7 +235,7 @@ int XGDMatrixCreateFromCSR(const bst_ulong *indptr,
   }
   mat.info.info.num_row = nindptr - 1;
   *out = p_mat;
-  API_END(delete p_mat);
+  API_END_FINALIZE(delete p_mat);
 }
 
 int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
@@ -258,7 +276,7 @@ int XGDMatrixCreateFromCSC(const bst_ulong *col_ptr,
   mat.info.info.num_row = mat.row_ptr_.size() - 1;
   mat.info.info.num_col = static_cast<size_t>(ncol);
   *out = p_mat;
-  API_END(delete p_mat);
+  API_END_FINALIZE(delete p_mat);
 }
 
 int XGDMatrixCreateFromMat(const float *data,
@@ -289,7 +307,7 @@ int XGDMatrixCreateFromMat(const float *data,
     mat.row_ptr_.push_back(mat.row_ptr_.back() + nelem);
   }
   *out = p_mat;
-  API_END(delete p_mat);
+  API_END_FINALIZE(delete p_mat);
 }
 
 int XGDMatrixSliceDMatrix(DMatrixHandle handle,
@@ -340,7 +358,7 @@ int XGDMatrixSliceDMatrix(DMatrixHandle handle,
     }
   }
   *out = p_ret;
-  API_END(delete p_ret);
+  API_END_FINALIZE(delete p_ret);
 }
 
 int XGDMatrixFree(DMatrixHandle handle) {