diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index bb6f3acd9c9f..4806692c0b57 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -170,7 +170,7 @@ jobs: run: | cd R-package R.exe CMD INSTALL . - Rscript.exe tests/run_lint.R + Rscript.exe tests/helper_scripts/run_lint.R test-with-R: runs-on: ${{ matrix.config.os }} diff --git a/Makefile b/Makefile index 20ddace00ed2..63319ba3ef63 100644 --- a/Makefile +++ b/Makefile @@ -134,14 +134,16 @@ Rpack: clean_all sed -i -e 's/@OPENMP_LIB@//g' xgboost/src/Makevars.win rm -f xgboost/src/Makevars.win-e # OSX sed create this extra file; remove it bash R-package/remove_warning_suppression_pragma.sh + bash xgboost/remove_warning_suppression_pragma.sh rm xgboost/remove_warning_suppression_pragma.sh + rm -rfv xgboost/tests/helper_scripts/ Rbuild: Rpack R CMD build --no-build-vignettes xgboost rm -rf xgboost Rcheck: Rbuild - R CMD check xgboost*.tar.gz + R CMD check --as-cran xgboost*.tar.gz -include build/*.d -include build/*/*.d diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 1a35eaa0612a..f599a57858be 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -2,7 +2,7 @@ Package: xgboost Type: Package Title: Extreme Gradient Boosting Version: 1.3.0.1 -Date: 2020-02-21 +Date: 2020-08-28 Authors@R: c( person("Tianqi", "Chen", role = c("aut"), email = "tianqi.tchen@gmail.com"), diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index fb0ac542f8bf..1795742c8160 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -38,6 +38,7 @@ export(xgb.dump) export(xgb.gblinear.history) export(xgb.ggplot.deepness) export(xgb.ggplot.importance) +export(xgb.ggplot.shap.summary) export(xgb.importance) export(xgb.load) export(xgb.load.raw) @@ -46,6 +47,7 @@ export(xgb.plot.deepness) export(xgb.plot.importance) export(xgb.plot.multi.trees) export(xgb.plot.shap) +export(xgb.plot.shap.summary) export(xgb.plot.tree) export(xgb.save) export(xgb.save.raw) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index b0c653f17671..846cc1f4404e 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -349,6 +349,7 @@ NULL #' # Save as a stand-alone file (JSON); load it with xgb.load() #' xgb.save(bst, 'xgb.model.json') #' bst2 <- xgb.load('xgb.model.json') +#' if (file.exists('xgb.model.json')) file.remove('xgb.model.json') #' #' # Save as a raw byte vector; load it with xgb.load.raw() #' xgb_bytes <- xgb.save.raw(bst) @@ -364,6 +365,7 @@ NULL #' obj2 <- readRDS('my_object.rds') #' # Re-construct xgb.Booster object from the bytes #' bst2 <- xgb.load.raw(obj2$xgb_model_bytes) +#' if (file.exists('my_object.rds')) file.remove('my_object.rds') #' #' @name a-compatibility-note-for-saveRDS-save NULL diff --git a/R-package/R/xgb.cv.R b/R-package/R/xgb.cv.R index fd74d0f6b625..fb48ca6071bf 100644 --- a/R-package/R/xgb.cv.R +++ b/R-package/R/xgb.cv.R @@ -79,7 +79,7 @@ #' #' All observations are used for both training and validation. 
#' -#' Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation} +#' Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29} #' #' @return #' An object of class \code{xgb.cv.synchronous} with the following elements: diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index d9ea69786ad9..70a357ee9866 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -200,9 +200,9 @@ xgb.plot.shap <- function(data, shap_contrib = NULL, features = NULL, top_n = 1, #' @return A \code{ggplot2} object. #' @export #' -#' @examples See \code{\link{xgb.plot.shap}}. +#' @examples # See \code{\link{xgb.plot.shap}}. #' @seealso \code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}}, -#' \code{\url{https://github.com/slundberg/shap}} +#' \url{https://github.com/slundberg/shap} xgb.plot.shap.summary <- function(data, shap_contrib = NULL, features = NULL, top_n = 10, model = NULL, trees = NULL, target_class = NULL, approxcontrib = FALSE, subsample = NULL) { # Only ggplot implementation is available. diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index a6755f5fa277..0449ae266243 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -130,16 +130,16 @@ #' Note that when using a customized metric, only this single metric can be used. #' The following is the list of built-in metrics for which Xgboost provides optimized implementation: #' \itemize{ -#' \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} -#' \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} -#' \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss} +#' \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error} +#' \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} +#' \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} #' \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. #' By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. #' Different threshold (e.g., 0.) could be specified as "error@0." #' \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. -#' \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. +#' \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. #' \item \code{aucpr} Area under the PR curve. \url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. -#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG} +#' \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). 
\url{https://en.wikipedia.org/wiki/NDCG} #' } #' #' The following callbacks are automatically created when certain parameters are set: diff --git a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd index 63b8dfce52ac..85b52243c1b9 100644 --- a/R-package/man/a-compatibility-note-for-saveRDS-save.Rd +++ b/R-package/man/a-compatibility-note-for-saveRDS-save.Rd @@ -43,6 +43,7 @@ bst2 <- xgb.load('xgb.model') # Save as a stand-alone file (JSON); load it with xgb.load() xgb.save(bst, 'xgb.model.json') bst2 <- xgb.load('xgb.model.json') +if (file.exists('xgb.model.json')) file.remove('xgb.model.json') # Save as a raw byte vector; load it with xgb.load.raw() xgb_bytes <- xgb.save.raw(bst) @@ -58,5 +59,6 @@ saveRDS(obj, 'my_object.rds') obj2 <- readRDS('my_object.rds') # Re-construct xgb.Booster object from the bytes bst2 <- xgb.load.raw(obj2$xgb_model_bytes) +if (file.exists('my_object.rds')) file.remove('my_object.rds') } diff --git a/R-package/man/normalize.Rd b/R-package/man/normalize.Rd new file mode 100644 index 000000000000..6a05e83426b1 --- /dev/null +++ b/R-package/man/normalize.Rd @@ -0,0 +1,18 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.ggplot.R +\name{normalize} +\alias{normalize} +\title{Scale feature value to have mean 0, standard deviation 1} +\usage{ +normalize(x) +} +\arguments{ +\item{x}{Numeric vector} +} +\value{ +Numeric vector with mean 0 and sd 1. +} +\description{ +This is used to compare multiple features on the same plot. +Internal utility function +} diff --git a/R-package/man/prepare.ggplot.shap.data.Rd b/R-package/man/prepare.ggplot.shap.data.Rd new file mode 100644 index 000000000000..57f71a3ff800 --- /dev/null +++ b/R-package/man/prepare.ggplot.shap.data.Rd @@ -0,0 +1,27 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.ggplot.R +\name{prepare.ggplot.shap.data} +\alias{prepare.ggplot.shap.data} +\title{Combine and melt feature values and SHAP contributions for sample +observations.} +\usage{ +prepare.ggplot.shap.data(data_list, normalize = FALSE) +} +\arguments{ +\item{data_list}{List containing 'data' and 'shap_contrib' returned by +\code{xgb.shap.data()}.} + +\item{normalize}{Whether to standardize feature values to have mean 0 and +standard deviation 1 (useful for comparing multiple features on the same +plot). Default \code{FALSE}.} +} +\value{ +A data.table containing the observation ID, the feature name, the + feature value (normalized if specified), and the SHAP contribution value. +} +\description{ +Conforms to data format required for ggplot functions. +} +\details{ +Internal utility function. +} diff --git a/R-package/man/xgb.cv.Rd b/R-package/man/xgb.cv.Rd index 98e70e48cade..86a88007be21 100644 --- a/R-package/man/xgb.cv.Rd +++ b/R-package/man/xgb.cv.Rd @@ -154,7 +154,7 @@ The cross-validation process is then repeated \code{nrounds} times, with each of All observations are used for both training and validation. 
-Adapted from \url{http://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29#k-fold_cross-validation} +Adapted from \url{https://en.wikipedia.org/wiki/Cross-validation_\%28statistics\%29} } \examples{ data(agaricus.train, package='xgboost') diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index 3cd3a8953dc8..abb21ce1957a 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -131,6 +131,7 @@ bst <- xgboost(agaricus.train$data, agaricus.train$label, nrounds = 50, xgb.plot.shap(agaricus.test$data, model = bst, features = "odor=none") contr <- predict(bst, agaricus.test$data, predcontrib = TRUE) xgb.plot.shap(agaricus.test$data, contr, model = bst, top_n = 12, n_col = 3) +xgb.ggplot.shap.summary(agaricus.test$data, contr, model = bst, top_n = 12) # Summary plot # multiclass example - plots for each class separately: nclass <- 3 @@ -149,6 +150,7 @@ xgb.plot.shap(x, model = mbst, trees = trees0 + 1, target_class = 1, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) xgb.plot.shap(x, model = mbst, trees = trees0 + 2, target_class = 2, top_n = 4, n_col = 2, col = col, pch = 16, pch_NA = 17) +xgb.ggplot.shap.summary(x, model = mbst, target_class = 0, top_n = 4) # Summary plot } \references{ diff --git a/R-package/man/xgb.plot.shap.summary.Rd b/R-package/man/xgb.plot.shap.summary.Rd new file mode 100644 index 000000000000..f757fd7404a3 --- /dev/null +++ b/R-package/man/xgb.plot.shap.summary.Rd @@ -0,0 +1,78 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.ggplot.R, R/xgb.plot.shap.R +\name{xgb.ggplot.shap.summary} +\alias{xgb.ggplot.shap.summary} +\alias{xgb.plot.shap.summary} +\title{SHAP contribution dependency summary plot} +\usage{ +xgb.ggplot.shap.summary( + data, + shap_contrib = NULL, + features = NULL, + top_n = 10, + model = NULL, + trees = NULL, + target_class = NULL, + approxcontrib = FALSE, + subsample = NULL +) + +xgb.plot.shap.summary( + data, + shap_contrib = NULL, + features = NULL, + top_n = 10, + model = NULL, + trees = NULL, + target_class = NULL, + approxcontrib = FALSE, + subsample = NULL +) +} +\arguments{ +\item{data}{data as a \code{matrix} or \code{dgCMatrix}.} + +\item{shap_contrib}{a matrix of SHAP contributions that was computed earlier for the above +\code{data}. When it is NULL, it is computed internally using \code{model} and \code{data}.} + +\item{features}{a vector of either column indices or of feature names to plot. When it is NULL, +feature importance is calculated, and \code{top_n} high ranked features are taken.} + +\item{top_n}{when \code{features} is NULL, top_n [1, 100] most important features in a model are taken.} + +\item{model}{an \code{xgb.Booster} model. It has to be provided when either \code{shap_contrib} +or \code{features} is missing.} + +\item{trees}{passed to \code{\link{xgb.importance}} when \code{features = NULL}.} + +\item{target_class}{is only relevant for multiclass models. When it is set to a 0-based class index, +only SHAP contributions for that specific class are used. +If it is not set, SHAP importances are averaged over all classes.} + +\item{approxcontrib}{passed to \code{\link{predict.xgb.Booster}} when \code{shap_contrib = NULL}.} + +\item{subsample}{a random fraction of data points to use for plotting. When it is NULL, +it is set so that up to 100K data points are used.} +} +\value{ +A \code{ggplot2} object. +} +\description{ +Compare SHAP contributions of different features. 
+} +\details{ +A point plot (each point representing one sample from \code{data}) is +produced for each feature, with the points plotted on the SHAP value axis. +Each point (observation) is coloured based on its feature value. The plot +hence allows us to see which features have a negative / positive contribution +on the model prediction, and whether the contribution is different for larger +or smaller values of the feature. We effectively try to replicate the +\code{summary_plot} function from https://github.com/slundberg/shap. +} +\examples{ +# See \code{\link{xgb.plot.shap}}. +} +\seealso{ +\code{\link{xgb.plot.shap}}, \code{\link{xgb.ggplot.shap.summary}}, + \url{https://github.com/slundberg/shap} +} diff --git a/R-package/man/xgb.shap.data.Rd b/R-package/man/xgb.shap.data.Rd new file mode 100644 index 000000000000..952445610bd2 --- /dev/null +++ b/R-package/man/xgb.shap.data.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgb.plot.shap.R +\name{xgb.shap.data} +\alias{xgb.shap.data} +\title{Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc. +Internal utility function.} +\usage{ +xgb.shap.data( + data, + shap_contrib = NULL, + features = NULL, + top_n = 1, + model = NULL, + trees = NULL, + target_class = NULL, + approxcontrib = FALSE, + subsample = NULL, + max_observations = 1e+05 +) +} +\value{ +A list containing: 'data', a matrix containing sample observations + and their feature values; 'shap_contrib', a matrix containing the SHAP contribution + values for these observations. +} +\description{ +Prepare data for SHAP plots. To be used in xgb.plot.shap, xgb.plot.shap.summary, etc. +Internal utility function. +} diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 94db595cbc65..e68962fb6b41 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -215,16 +215,16 @@ User may set one or several \code{eval_metric} parameters. Note that when using a customized metric, only this single metric can be used. The following is the list of built-in metrics for which Xgboost provides optimized implementation: \itemize{ - \item \code{rmse} root mean square error. \url{http://en.wikipedia.org/wiki/Root_mean_square_error} - \item \code{logloss} negative log-likelihood. \url{http://en.wikipedia.org/wiki/Log-likelihood} - \item \code{mlogloss} multiclass logloss. \url{http://wiki.fast.ai/index.php/Log_Loss} + \item \code{rmse} root mean square error. \url{https://en.wikipedia.org/wiki/Root_mean_square_error} + \item \code{logloss} negative log-likelihood. \url{https://en.wikipedia.org/wiki/Log-likelihood} + \item \code{mlogloss} multiclass logloss. \url{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html} \item \code{error} Binary classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. By default, it uses the 0.5 threshold for predicted values to define negative and positive instances. Different threshold (e.g., 0.) could be specified as "error@0." \item \code{merror} Multiclass classification error rate. It is calculated as \code{(# wrong cases) / (# all cases)}. - \item \code{auc} Area under the curve. \url{http://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. + \item \code{auc} Area under the curve. \url{https://en.wikipedia.org/wiki/Receiver_operating_characteristic#'Area_under_curve} for ranking evaluation. \item \code{aucpr} Area under the PR curve. 
\url{https://en.wikipedia.org/wiki/Precision_and_recall} for ranking evaluation. - \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{http://en.wikipedia.org/wiki/NDCG} + \item \code{ndcg} Normalized Discounted Cumulative Gain (for ranking task). \url{https://en.wikipedia.org/wiki/NDCG} } The following callbacks are automatically created when certain parameters are set: diff --git a/R-package/tests/generate_models_params.R b/R-package/tests/generate_models_params.R deleted file mode 100644 index 0f71ab22c640..000000000000 --- a/R-package/tests/generate_models_params.R +++ /dev/null @@ -1,10 +0,0 @@ -model_generator_metadata <- function() { - return (list( - kRounds = 2, - kRows = 1000, - kCols = 4, - kForests = 2, - kMaxDepth = 2, - kClasses = 3 - )) -} diff --git a/R-package/tests/generate_models.R b/R-package/tests/helper_scripts/generate_models.R similarity index 96% rename from R-package/tests/generate_models.R rename to R-package/tests/helper_scripts/generate_models.R index 854b7944e4a6..d38b23a19374 100644 --- a/R-package/tests/generate_models.R +++ b/R-package/tests/helper_scripts/generate_models.R @@ -5,7 +5,14 @@ library(Matrix) source('./generate_models_params.R') set.seed(0) -metadata <- model_generator_metadata() +metadata <- list( + kRounds = 2, + kRows = 1000, + kCols = 4, + kForests = 2, + kMaxDepth = 2, + kClasses = 3 +) X <- Matrix(data = rnorm(metadata$kRows * metadata$kCols), nrow = metadata$kRows, ncol = metadata$kCols, sparse = TRUE) w <- runif(metadata$kRows) diff --git a/R-package/tests/run_lint.R b/R-package/tests/helper_scripts/run_lint.R similarity index 100% rename from R-package/tests/run_lint.R rename to R-package/tests/helper_scripts/run_lint.R diff --git a/R-package/tests/testthat/test_model_compatibility.R b/R-package/tests/testthat/test_model_compatibility.R index 7204ed89142c..105a60d169a1 100644 --- a/R-package/tests/testthat/test_model_compatibility.R +++ b/R-package/tests/testthat/test_model_compatibility.R @@ -1,10 +1,16 @@ require(xgboost) require(jsonlite) -source('../generate_models_params.R') context("Models from previous versions of XGBoost can be loaded") -metadata <- model_generator_metadata() +metadata <- list( + kRounds = 2, + kRows = 1000, + kCols = 4, + kForests = 2, + kMaxDepth = 2, + kClasses = 3 +) run_model_param_check <- function (config) { testthat::expect_equal(config$learner$learner_model_param$num_feature, '4') diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 8181fcbb969b..c41f4f125def 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -57,7 +57,7 @@ To answer the question above we will convert *categorical* variables to `numeric In this Vignette we will see how to transform a *dense* `data.frame` (*dense* = few zeroes in the matrix) with *categorical* variables to a very *sparse* matrix (*sparse* = lots of zero in the matrix) of `numeric` features. -The method we are going to see is usually called [one-hot encoding](http://en.wikipedia.org/wiki/One-hot). +The method we are going to see is usually called [one-hot encoding](https://en.wikipedia.org/wiki/One-hot). The first step is to load `Arthritis` dataset in memory and wrap it with `data.table` package. 
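
Editor's note (not part of the diff): the vignette hunks above and below discuss converting the categorical `Arthritis` columns into a sparse one-hot / dummy-contrast matrix, and a later hunk passes a `sparse_matrix` object to `xgboost()`. The following minimal sketch illustrates that encoding step; the model formula and column choices are assumptions for illustration and may differ from the exact code in the vignette.

```r
# Illustrative sketch of the one-hot / dummy-contrast encoding described in
# discoverYourData.Rmd (assumes the vcd package, which ships the Arthritis data).
library(data.table)
library(Matrix)
library(vcd)

data(Arthritis)
df <- data.table(Arthritis, keep.rownames = FALSE)

# sparse.model.matrix() expands each level of every categorical feature into
# its own {0, 1} column; dropping the intercept keeps the dummy-contrast form.
# The formula below is an assumption mirroring the vignette's intent.
sparse_matrix <- sparse.model.matrix(Improved ~ . - 1, data = df)
head(sparse_matrix)
```
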
@@ -66,7 +66,7 @@ data(Arthritis) df <- data.table(Arthritis, keep.rownames = FALSE) ``` -> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](http://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`. +> `data.table` is 100% compliant with **R** `data.frame` but its syntax is more consistent and its performance for large dataset is [best in class](https://stackoverflow.com/questions/21435339/data-table-vs-dplyr-can-one-do-something-well-the-other-cant-or-does-poorly) (`dplyr` from **R** and `Pandas` from **Python** [included](https://github.com/Rdatatable/data.table/wiki/Benchmarks-%3A-Grouping)). Some parts of **Xgboost** **R** package use `data.table`. The first thing we want to do is to have a look to the first few lines of the `data.table`: @@ -137,8 +137,8 @@ levels(df[,Treatment]) #### Encoding categorical features Next step, we will transform the categorical data to dummy variables. -Several encoding methods exist, e.g., [one-hot encoding](http://en.wikipedia.org/wiki/One-hot) is a common approach. -We will use the [dummy contrast coding](http://www.ats.ucla.edu/stat/r/library/contrast_coding.htm#dummy) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). +Several encoding methods exist, e.g., [one-hot encoding](https://en.wikipedia.org/wiki/One-hot) is a common approach. +We will use the [dummy contrast coding](https://stats.idre.ucla.edu/r/library/r-library-contrast-coding-systems-for-categorical-variables/) which is popular because it produces "full rank" encoding (also see [this blog post by Max Kuhn](http://appliedpredictivemodeling.com/blog/2013/10/23/the-basics-of-encoding-categorical-data-for-predictive-models)). The purpose is to transform each value of each *categorical* feature into a *binary* feature `{0, 1}`. @@ -176,7 +176,7 @@ bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, You can see some `train-error: 0.XXXXX` lines followed by a number. It decreases. Each line shows how well the model explains your data. Lower is better. -A model which fits too well may [overfit](http://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future). +A model which fits too well may [overfit](https://en.wikipedia.org/wiki/Overfitting) (meaning it copy/paste too much the past, and won't be that good to predict the future). > Here you can see the numbers decrease until line 7 and then increase. > @@ -304,7 +304,7 @@ Linear model may not be that smart in this scenario. Special Note: What about Random Forests™? ----------------------------------------- -As you may know, [Random Forests™](http://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](http://en.wikipedia.org/wiki/Ensemble_learning) family. +As you may know, [Random Forests™](https://en.wikipedia.org/wiki/Random_forest) algorithm is cousin with boosting and both are part of the [ensemble learning](https://en.wikipedia.org/wiki/Ensemble_learning) family. 
Both trains several decision trees for one dataset. The *main* difference is that in Random Forests™, trees are independent and in boosting, the tree `N+1` focus its learning on the loss (<=> what has not been well modeled by the tree `N`). diff --git a/R-package/vignettes/xgboost.bib b/R-package/vignettes/xgboost.bib index f21bdae1672e..5deb1e13d1f8 100644 --- a/R-package/vignettes/xgboost.bib +++ b/R-package/vignettes/xgboost.bib @@ -24,7 +24,7 @@ @misc{ author = "K. Bache and M. Lichman", year = "2013", title = "{UCI} Machine Learning Repository", - url = "http://archive.ics.uci.edu/ml", + url = "http://archive.ics.uci.edu/ml/", institution = "University of California, Irvine, School of Information and Computer Sciences" } diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index c2f990e1480a..ab72c677938e 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -68,7 +68,7 @@ The version 0.4-2 is on CRAN, and you can install it by: install.packages("xgboost") ``` -Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost) +Formerly available versions can be obtained from the CRAN [archive](https://cran.r-project.org/src/contrib/Archive/xgboost/) ## Learning