dmlc · trivialfis · Dec 12, 2023 · Nov 29, 2023 · Dec 12, 2023 · david-cortes
diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R
@@ -19,7 +19,8 @@
 #' @param missing a float value to represents missing values in data (used only when input is a dense matrix).
 #'        It is useful when a 0 or some other extreme value represents missing values in data.
 #' @param silent whether to suppress printing an informational message after loading from a file.
-#' @param feature_names Set names for features.
+#' @param feature_names Set names for features. Overrides column names in data
+#'        frame and matrix.
 #' @param nthread Number of threads used for creating DMatrix.
 #' @param group Group size for all ranking group.
 #' @param qid Query ID for data samples, used for ranking.
@@ -32,6 +33,8 @@
 #' If a DMatrix gets serialized and then de-serialized (for example, when saving data in an R session or caching
 #' chunks in an Rmd file), the resulting object will not be usable anymore and will need to be reconstructed
 #' from the original source of data.
+#' @param enable_categorical Experimental support of specializing for
+#'        categorical features. JSON/UBJSON serialization format is required.
 #'
 #' @examples
 #' data(agaricus.train, package='xgboost')
@@ -58,19 +61,28 @@ xgb.DMatrix <- function(
   qid = NULL,
   label_lower_bound = NULL,
   label_upper_bound = NULL,
-  feature_weights = NULL
+  feature_weights = NULL,
+  enable_categorical = FALSE
 ) {
   if (!is.null(group) && !is.null(qid)) {
     stop("Either one of 'group' or 'qid' should be NULL")
   }
+  cnames <- NULL
+  ctypes <- NULL
   if (typeof(data) == "character") {
-    if (length(data) > 1)
-      stop("'data' has class 'character' and length ", length(data),
-           ".\n  'data' accepts either a numeric matrix or a single filename.")
+    if (length(data) > 1) {
+      stop(
+        "'data' has class 'character' and length ", length(data),
+        ".\n  'data' accepts either a numeric matrix or a single filename."
+      )
+    }
     data <- path.expand(data)
     handle <- .Call(XGDMatrixCreateFromFile_R, data, as.integer(silent))
   } else if (is.matrix(data)) {
-    handle <- .Call(XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1)))
+    handle <- .Call(
+      XGDMatrixCreateFromMat_R, data, missing, as.integer(NVL(nthread, -1))
+    )
+    cnames <- colnames(data)
   } else if (inherits(data, "dgCMatrix")) {
     handle <- .Call(
       XGDMatrixCreateFromCSC_R,
@@ -103,6 +115,40 @@ xgb.DMatrix <- function(
       missing,
       as.integer(NVL(nthread, -1))
     )
+  } else if (is.data.frame(data)) {
+    ctypes <- sapply(data, function(x) {
+      if (is.factor(x)) {
+        if (!enable_categorical) {
+          stop(
+            "When factor type is used, the parameter `enable_categorical`",
+            " must be set to TRUE."
+          )
+        }
+        "c"
+      } else if (is.integer(x)) {
+        "int"
+      } else if (is.logical(x)) {
+        "i"
+      } else {
+        if (!is.numeric(x)) {
+          stop("Invalid type in dataframe.")
+        }
+        "float"
+      }
+    })
+    ## sapply() simplifies the columns into one matrix, so mixed integer/logical become real.
+    data <- as.data.frame(sapply(data, function(x) {
+      if (is.factor(x)) {
+        ## XGBoost uses 0-based indexing.
+        as.numeric(x) - 1
+      } else {
+        x
+      }
+    }))
+    handle <- .Call(
+      XGDMatrixCreateFromDF_R, data, missing, as.integer(NVL(nthread, -1))
+    )
+    cnames <- colnames(data)
   } else {
     stop("xgb.DMatrix does not support construction from ", typeof(data))
   }
@@ -119,7 +165,11 @@ xgb.DMatrix <- function(
   if (!is.null(base_margin)) {
     setinfo(dmat, "base_margin", base_margin)
   }
+  if (!is.null(cnames)) {
+    setinfo(dmat, "feature_name", cnames)
+  }
   if (!is.null(feature_names)) {
+    ## override cnames
     setinfo(dmat, "feature_name", feature_names)
   }
   if (!is.null(group)) {
@@ -137,6 +187,9 @@ xgb.DMatrix <- function(
   if (!is.null(feature_weights)) {
     setinfo(dmat, "feature_weights", feature_weights)
   }
+  if (!is.null(ctypes)) {
+    setinfo(dmat, "feature_type", ctypes)
+  }
 
   return(dmat)
 }

diff --git a/R-package/man/xgb.DMatrix.Rd b/R-package/man/xgb.DMatrix.Rd
diff --git a/R-package/src/init.c b/R-package/src/init.c
@@ -41,6 +41,7 @@ extern SEXP XGDMatrixCreateFromFile_R(SEXP, SEXP);
 extern SEXP XGDMatrixCreateFromMat_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixGetFloatInfo_R(SEXP, SEXP);
 extern SEXP XGDMatrixGetUIntInfo_R(SEXP, SEXP);
+extern SEXP XGDMatrixCreateFromDF_R(SEXP, SEXP, SEXP);
 extern SEXP XGDMatrixGetStrFeatureInfo_R(SEXP, SEXP);
 extern SEXP XGDMatrixNumCol_R(SEXP);
 extern SEXP XGDMatrixNumRow_R(SEXP);
@@ -79,6 +80,7 @@ static const R_CallMethodDef CallEntries[] = {
   {"XGDMatrixCreateFromMat_R",    (DL_FUNC) &XGDMatrixCreateFromMat_R,    3},
   {"XGDMatrixGetFloatInfo_R",     (DL_FUNC) &XGDMatrixGetFloatInfo_R,     2},
   {"XGDMatrixGetUIntInfo_R",      (DL_FUNC) &XGDMatrixGetUIntInfo_R,      2},
+  {"XGDMatrixCreateFromDF_R",     (DL_FUNC) &XGDMatrixCreateFromDF_R,     3},
   {"XGDMatrixGetStrFeatureInfo_R", (DL_FUNC) &XGDMatrixGetStrFeatureInfo_R, 2},
   {"XGDMatrixNumCol_R",           (DL_FUNC) &XGDMatrixNumCol_R,           1},
   {"XGDMatrixNumRow_R",           (DL_FUNC) &XGDMatrixNumRow_R,           1},

diff --git a/R-package/src/xgboost_R.cc b/R-package/src/xgboost_R.cc
@@ -223,6 +223,69 @@ XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
   return ret;
 }
 
+XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads) {
+  SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
+  R_API_BEGIN();
+
+  DMatrixHandle handle;
+
+  auto make_vec = [&](auto const *ptr, std::int32_t len) {
+    auto v = xgboost::linalg::MakeVec(ptr, len);
+    return xgboost::linalg::ArrayInterface(v);
+  };
+
+  std::int32_t rc{0};
+  {
+    using xgboost::Json;
+    auto n_features = Rf_xlength(df);
+    std::vector<Json> array(n_features);
+    CHECK_GT(n_features, 0);
+    auto len = Rf_xlength(VECTOR_ELT(df, 0));
+    // The R wrapper normally coerces all columns to numeric (REALSXP); the integer and
+    // logical handlers are kept for a future copy-free path. NOTE(review): they do not map
+    // R's NA (INT_MIN) to `missing` -- confirm before relying on them.
+    for (decltype(n_features) i = 0; i < n_features; ++i) {
+      switch (TYPEOF(VECTOR_ELT(df, i))) {
+        case INTSXP: {
+          auto const *ptr = INTEGER(VECTOR_ELT(df, i));
+          array[i] = make_vec(ptr, len);
+          break;
+        }
+        case REALSXP: {
+          auto const *ptr = REAL(VECTOR_ELT(df, i));
+          array[i] = make_vec(ptr, len);
+          break;
+        }
+        case LGLSXP: {
+          auto const *ptr = LOGICAL(VECTOR_ELT(df, i));
+          array[i] = make_vec(ptr, len);
+          break;
+        }
+        default: {
+          LOG(FATAL) << "data.frame has unsupported type.";
+        }
+      }
+    }
+
+    Json jinterface{std::move(array)};
+    auto sinterface = Json::Dump(jinterface);
+    Json jconfig{xgboost::Object{}};
+    jconfig["missing"] = asReal(missing);
+    jconfig["nthread"] = asInteger(n_threads);
+    auto sconfig = Json::Dump(jconfig);
+
+    rc = XGDMatrixCreateFromColumnar(sinterface.c_str(), sconfig.c_str(), &handle);
+  }
+
+  CHECK_CALL(rc);
+  R_SetExternalPtrAddr(ret, handle);
+  R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
+  R_API_END();
+  Rf_unprotect(1);
+
+  return ret;
+}
+
 namespace {
 void CreateFromSparse(SEXP indptr, SEXP indices, SEXP data, std::string *indptr_str,
                       std::string *indices_str, std::string *data_str) {
@@ -298,6 +361,7 @@ XGB_DLL SEXP XGDMatrixCreateFromCSR_R(SEXP indptr, SEXP indices, SEXP data, SEXP
     res_code = XGDMatrixCreateFromCSR(sindptr.c_str(), sindices.c_str(), sdata.c_str(), ncol,
                                       config.c_str(), &handle);
   }
+  CHECK_CALL(res_code);
   R_SetExternalPtrAddr(ret, handle);
   R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
   R_API_END();

diff --git a/R-package/src/xgboost_R.h b/R-package/src/xgboost_R.h
@@ -53,6 +53,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent);
 XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat,
                                       SEXP missing,
                                       SEXP n_threads);
+
+/**
+ * @brief Create matrix content from a data frame.
+ * @param df R data.frame object
+ * @param missing which value to represent missing value
+ * @param n_threads Number of threads used to construct DMatrix from the data frame.
+ * @return created dmatrix
+ */
+XGB_DLL SEXP XGDMatrixCreateFromDF_R(SEXP df, SEXP missing, SEXP n_threads);
+
 /*!
  * \brief create a matrix content from CSC format
  * \param indptr pointer to column headers

diff --git a/R-package/tests/testthat/test_dmatrix.R b/R-package/tests/testthat/test_dmatrix.R
@@ -322,3 +322,30 @@ test_that("xgb.DMatrix: can get group for both 'qid' and 'group' constructors",
   expected_gr <- c(0, 20, 40, 100)
   expect_equal(info_gr, expected_gr)
 })
+
+test_that("xgb.DMatrix: data.frame", {
+  mixed <- data.frame(
+    a = (1:4) / 10,
+    num = c(1, NA, 3, 4),
+    as.int = as.integer(c(1, 2, 3, 4)),
+    lo = c(TRUE, FALSE, NA, TRUE),
+    str.fac = c("a", "b", "d", "c"),
+    as.fac = as.factor(c(3, 5, 8, 11)),
+    stringsAsFactors = TRUE
+  )
+
+  ## Factor columns are only accepted when `enable_categorical` is TRUE.
+  expect_error(xgb.DMatrix(mixed))
+  dmat <- xgb.DMatrix(mixed, enable_categorical = TRUE)
+  expect_equal(colnames(dmat), colnames(mixed))
+  expected_types <- c("float", "float", "int", "i", "c", "c")
+  expect_equal(getinfo(dmat, "feature_type"), expected_types)
+
+  with_na <- data.frame(
+    missing = c("a", "b", "d", NA),
+    valid = c("a", "b", "d", "c"),
+    stringsAsFactors = TRUE
+  )
+  dmat_na <- xgb.DMatrix(with_na, enable_categorical = TRUE)
+  expect_equal(getinfo(dmat_na, "feature_type"), c("c", "c"))
+})
diff --git a/demo/guide-python/cat_in_the_dat.py b/demo/guide-python/cat_in_the_dat.py
@@ -78,6 +78,10 @@ def categorical_model(X: pd.DataFrame, y: pd.Series, output_dir: str) -> None:
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, random_state=1994, test_size=0.2
     )
+    # Be aware that the encoding for X_train and X_test is the same here. In practice,
+    # we should use an encoder (like sklearn's OrdinalEncoder) to obtain the
+    # categorical values.
+
     # Specify `enable_categorical` to True.
     clf = xgb.XGBClassifier(
         **params,

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
@@ -159,6 +159,16 @@ XGB_DLL int XGDMatrixCreateFromURI(char const *config, DMatrixHandle *out);
 XGB_DLL int XGDMatrixCreateFromCSREx(const size_t *indptr, const unsigned *indices,
                                      const float *data, size_t nindptr, size_t nelem,
                                      size_t num_col, DMatrixHandle *out);
+/**
+ * @brief Create a DMatrix from columnar data. (table)
+ *
+ * @param data   See @ref XGBoosterPredictFromColumnar for details.
+ * @param config See @ref XGDMatrixCreateFromDense for details.
+ * @param out    The created dmatrix.
+ *
+ * @return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGDMatrixCreateFromColumnar(char const *data, char const *config, DMatrixHandle *out);
 
 /**
  * @example c-api-demo.c
@@ -514,6 +524,16 @@ XGB_DLL int
 XGProxyDMatrixSetDataCudaArrayInterface(DMatrixHandle handle,
                                         const char *c_interface_str);
 
+/**
+ * @brief Set columnar (table) data on a DMatrix proxy.
+ *
+ * @param handle          A DMatrix proxy created by @ref XGProxyDMatrixCreate
+ * @param c_interface_str See @ref XGBoosterPredictFromColumnar for details.
+ *
+ * @return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGProxyDMatrixSetDataCudaColumnar(DMatrixHandle handle, char const *c_interface_str);
+
 /*!
  * \brief Set data on a DMatrix proxy.
  *
@@ -1113,6 +1133,31 @@ XGB_DLL int XGBoosterPredictFromDense(BoosterHandle handle, char const *values,
  * @example inference.c
  */
 
+/**
+ * @brief Inplace prediction from CPU columnar data. (Table)
+ *
+ * @note If the booster is configured to run on a CUDA device, XGBoost falls back to run
+ *       prediction with DMatrix with a performance warning.
+ *
+ * @param handle          Booster handle.
+ * @param array_interface A JSON array of __array_interface__ for each column.
+ * @param c_json_config   See @ref XGBoosterPredictFromDMatrix for more info.
+ *   Additional fields for inplace prediction are:
+ *     - "missing": float
+ * @param m               An optional (NULL if not available) proxy DMatrix instance
+ *                        storing meta info.
+ *
+ * @param out_shape       See @ref XGBoosterPredictFromDMatrix for more info.
+ * @param out_dim         See @ref XGBoosterPredictFromDMatrix for more info.
+ * @param out_result      See @ref XGBoosterPredictFromDMatrix for more info.
+ *
+ * @return 0 when success, -1 when failure happens
+ */
+XGB_DLL int XGBoosterPredictFromColumnar(BoosterHandle handle, char const *array_interface,
+                                         char const *c_json_config, DMatrixHandle m,
+                                         bst_ulong const **out_shape, bst_ulong *out_dim,
+                                         const float **out_result);
+
 /**
  * \brief Inplace prediction from CPU CSR matrix.
  *