Skip to content

Commit

Permalink
Support dataframe data format in native XGBoost.
Browse files Browse the repository at this point in the history
- Implement a columnar adapter.
- Refactor Python pandas handling code to avoid converting into numpy.
- Add support in R for transforming columns.
  • Loading branch information
trivialfis committed Dec 11, 2023
1 parent 5623521 commit 7bf3ccf
Show file tree
Hide file tree
Showing 21 changed files with 715 additions and 211 deletions.
65 changes: 59 additions & 6 deletions R-package/R/xgb.DMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
#' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
#' It is useful when a 0 or some other extreme value represents missing values in data.
#' @param silent whether to suppress printing an informational message after loading from a file.
#' @param feature_names Set names for features.
#' @param feature_names Set names for features. Overrides column names in data
#' frame and matrix.
#' @param nthread Number of threads used for creating DMatrix.
#' @param group Group size for all ranking group.
#' @param qid Query ID for data samples, used for ranking.
Expand All @@ -32,6 +33,8 @@
#' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
#' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
#' from the original source of data.
#' @param enable_categorical Experimental support of specializing for
#' categorical features. JSON/UBJSON serialization format is required.
#'
#' @examples
#' data(agaricus.train, package='xgboost')
Expand All @@ -58,19 +61,28 @@ xgb.DMatrix <- function(
qid = NULL,
label_lower_bound = NULL,
label_upper_bound = NULL,
feature_weights = NULL
feature_weights = NULL,
enable_categorical = FALSE
) {
if (!is.null(group) && !is.null(qid)) {
stop("Either one of 'group' or 'qid' should be NULL")
}
cnames <- NULL
ctypes <- NULL
if (typeof(data) == "character") {
if (length(data) > 1)
stop("'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename.")
if (length(data) > 1) {
stop(
"'data' has class 'character' and length ", length(data),
".\n 'data' accepts either a numeric matrix or a single filename."
)
}
data <- path.expand(data)
handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
} else if (is.matrix(data)) {
handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
handle <- .Call(
XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
)
cnames <- colnames(data)
} else if (inherits(data, "dgCMatrix")) {
handle <- .Call(
XGDMatrixCreateFromCSC_R,
Expand Down Expand Up @@ -103,6 +115,40 @@ xgb.DMatrix <- function(
missing,
as.integer(NVL(nthread, -1))
)
} else if (is.data.frame(data)) {
ctypes <- sapply(data, function(x) {
if (is.factor(x)) {
if (!enable_categorical) {
stop(
"When factor type is used, the parameter `enable_categorical`",
" must be set to TRUE."
)
}
"c"
} else if (is.integer(x)) {
"int"
} else if (is.logical(x)) {
"i"
} else {
if (!is.numeric(x)) {
stop("Invalid type in dataframe.")
}
"float"
}
})
## as.data.frame somehow converts integer/logical into real.
data <- as.data.frame(sapply(data, function(x) {
if (is.factor(x)) {
## XGBoost uses 0-based indexing.
as.numeric(x) - 1
} else {
x
}
}))
handle <- .Call(
XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
)
cnames <- colnames(data)
} else {
stop("xgb.DMatrix does not support construction from ", typeof(data))
}
Expand All @@ -119,7 +165,11 @@ xgb.DMatrix <- function(
if (!is.null(base_margin)) {
setinfo(dmat, "base_margin", base_margin)
}
if (!is.null(cnames)) {
setinfo(dmat, "feature_name", cnames)
}
if (!is.null(feature_names)) {
## override cnames
setinfo(dmat, "feature_name", feature_names)
}
if (!is.null(group)) {
Expand All @@ -137,6 +187,9 @@ xgb.DMatrix <- function(
if (!is.null(feature_weights)) {
setinfo(dmat, "feature_weights", feature_weights)
}
if (!is.null(ctypes)) {
setinfo(dmat, "feature_type", ctypes)
}

return(dmat)
}
Expand Down
9 changes: 7 additions & 2 deletions R-package/man/xgb.DMatrix.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions R-package/src/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
extern SEXP XGDMatrixNumCol_R(SEXP);
extern SEXP XGDMatrixNumRow_R(SEXP);
Expand Down Expand Up @@ -79,6 +80,7 @@ static const R_CallMethodDef CallEntries[] = {
{"XGDMatrixCreateFromMat_R", (DL_FUNC) &XGDMatrixCreateFromMat_R, 3},
{"XGDMatrixGetFloatInfo_R", (DL_FUNC) &XGDMatrixGetFloatInfo_R, 2},
{"XGDMatrixGetUIntInfo_R", (DL_FUNC) &XGDMatrixGetUIntInfo_R, 2},
{"XGDMatrixCreateFromDF_R", (DL_FUNC) &XGDMatrixCreateFromDF_R, 3},
{"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
{"XGDMatrixNumCol_R", (DL_FUNC) &XGDMatrixNumCol_R, 1},
{"XGDMatrixNumRow_R", (DL_FUNC) &XGDMatrixNumRow_R, 1},
Expand Down
64 changes: 64 additions & 0 deletions R-package/src/xgboost_R.cc
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,69 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
return ret;
}

XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
R_API_BEGIN();

DMatrixHandle handle;

auto make_vec = [&](auto const *ptr, std::int32_t len) {
auto v = xgboost::linalg::MakeVec(ptr, len);
return xgboost::linalg::ArrayInterface(v);
};

std::int32_t rc{0};
{
using xgboost::Json;
auto n_features = Rf_xlength(df);
std::vector<Json> array(n_features);
CHECK_GT(n_features, 0);
auto len = Rf_xlength(VECTOR_ELT(df, 0));
// The `data.frame` in R actually converts all data into numeric. The other type
// handlers here are not used. At the moment they are kept as a reference for when we
// can avoid making data copies during transformation.
for (decltype(n_features) i = 0; i < n_features; ++i) {
switch (TYPEOF(VECTOR_ELT(df, i))) {
case INTSXP: {
auto const *ptr = INTEGER(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
case REALSXP: {
auto const *ptr = REAL(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
case LGLSXP: {
auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
array[i] = make_vec(ptr, len);
break;
}
default: {
LOG(FATAL) << "data.frame has unsupported type.";
}
}
}

Json jinterface{std::move(array)};
auto sinterface = Json::Dump(jinterface);
Json jconfig{xgboost::Object{}};
jconfig["missing"] = asReal(missing);
jconfig["nthread"] = asInteger(n_threads);
auto sconfig = Json::Dump(jconfig);

rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
}

CHECK_CALL(rc);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
Rf_unprotect(1);

return ret;
}

namespace {
void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
std::string *indices_str, std::string *data_str) {
Expand Down Expand Up @@ -298,6 +361,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
config.c_str(), &handle);
}
CHECK_CALL(res_code);
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
Expand Down
10 changes: 10 additions & 0 deletions R-package/src/xgboost_R.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
SEXP missing,
SEXP n_threads);

/**
* @brief Create matrix content from a data frame.
* @param data R data.frame object
* @param missing which value to represent missing value
* @param n_threads Number of threads used to construct DMatrix from dense matrix.
* @return created dmatrix
*/
XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads);

/*!
* \brief create a matrix content from CSC format
* \param indptr pointer to column headers
Expand Down
27 changes: 27 additions & 0 deletions R-package/tests/testthat/test_dmatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -322,3 +322,30 @@ test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors",
expected_gr <- c(0, 20, 40, 100)
expect_equal(info_gr, expected_gr)
})

test_that("xgb.DMatrix: data.frame", {
df <- data.frame(
a = (1:4) / 10,
num = c(1, NA, 3, 4),
as.int = as.integer(c(1, 2, 3, 4)),
lo = c(TRUE, FALSE, NA, TRUE),
str.fac = c("a", "b", "d", "c"),
as.fac = as.factor(c(3, 5, 8, 11)),
stringsAsFactors = TRUE
)

m <- xgb.DMatrix(df, enable_categorical = TRUE)
expect_equal(colnames(m), colnames(df))
expect_equal(
getinfo(m, "feature_type"), c("float", "float", "int", "i", "c", "c")
)
expect_error(xgb.DMatrix(df))

df <- data.frame(
missing = c("a", "b", "d", NA),
valid = c("a", "b", "d", "c"),
stringsAsFactors = TRUE
)
m <- xgb.DMatrix(df, enable_categorical = TRUE)
expect_equal(getinfo(m, "feature_type"), c("c", "c"))
})
4 changes: 4 additions & 0 deletions demo/guide-python/cat_in_the_dat.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,10 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
X_train, X_test, y_train, y_test = train_test_split(
X, y, random_state=1994, test_size=0.2
)
# Be aware that the encoding for X_train and X_test are the same here. In practice,
# we should try to use an encoder like (sklearn OrdinalEncoder) to obtain the
# categorical values.

# Specify `enable_categorical` to True.
clf = xgb.XGBClassifier(
**params,
Expand Down
45 changes: 45 additions & 0 deletions include/xgboost/c_api.h
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,16 @@ XGB_DLL int XGDMatrixCreateFromURI(char const *config, DMatrixHandle *out);
XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices,
const float *data, size_t nindptr, size_t nelem,
size_t num_col, DMatrixHandle *out);
/**
* @brief Create a DMatrix from columnar data. (table)
*
* @param data See @ref XGBoosterPredictFromColumnar for details.
* @param config See @ref XGDMatrixCreateFromDense for details.
* @param out The created dmatrix.
*
* @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGDMatrixCreateFromColumnar(char const *data, char const *config, DMatrixHandle *out);

/**
* @example c-api-demo.c
Expand Down Expand Up @@ -514,6 +524,16 @@ XGB_DLL int
XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
const char *c_interface_str);

/**
* @brief Set columnar (table) data on a DMatrix proxy.
*
* @param handle A DMatrix proxy created by @ref XGProxyDMatrixCreate
* @param c_interface_str See @ref XGBoosterPredictFromColumnar for details.
*
* @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, char const *c_interface_str);

/*!
* \brief Set data on a DMatrix proxy.
*
Expand Down Expand Up @@ -1113,6 +1133,31 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
* @example inference.c
*/

/**
* @brief Inplace prediction from CPU columnar data. (Table)
*
* @note If the booster is configured to run on a CUDA device, XGBoost falls back to run
* prediction with DMatrix with a performance warning.
*
* @param handle Booster handle.
* @param values An JSON array of __array_interface__ for each column.
* @param config See @ref XGBoosterPredictFromDMatrix for more info.
* Additional fields for inplace prediction are:
* - "missing": float
* @param m An optional (NULL if not available) proxy DMatrix instance
* storing meta info.
*
* @param out_shape See @ref XGBoosterPredictFromDMatrix for more info.
* @param out_dim See @ref XGBoosterPredictFromDMatrix for more info.
* @param out_result See @ref XGBoosterPredictFromDMatrix for more info.
*
* @return 0 when success, -1 when failure happens
*/
XGB_DLL int XGBoosterPredictFromColumnar(BoosterHandle handle, char const *array_interface,
char const *c_json_config, DMatrixHandle m,
bst_ulong const **out_shape, bst_ulong *out_dim,
const float **out_result);

/**
* \brief Inplace prediction from CPU CSR matrix.
*
Expand Down

0 comments on commit 7bf3ccf

Please sign in to comment.