Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Support cudf as column-split input #9717

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 2 additions & 1 deletion include/xgboost/data.h
Expand Up @@ -389,7 +389,8 @@ class SparsePage {
/**
* \brief Reindex the column index with an offset.
*/
void Reindex(uint64_t feature_offset, int32_t n_threads);
void ReindexCPU(uint64_t feature_offset, int32_t n_threads);
void ReindexCUDA(uint64_t feature_offset);

void SortRows(int32_t n_threads);

Expand Down
15 changes: 13 additions & 2 deletions python-package/xgboost/data.py
Expand Up @@ -853,13 +853,18 @@ def _from_cudf_df(
feature_names: Optional[FeatureNames],
feature_types: Optional[FeatureTypes],
enable_categorical: bool,
data_split_mode: DataSplitMode = DataSplitMode.ROW,
) -> DispatchedDataBackendReturnType:
data, cat_codes, feature_names, feature_types = _transform_cudf_df(
data, feature_names, feature_types, enable_categorical
)
interfaces_str = _cudf_array_interfaces(data, cat_codes)
handle = ctypes.c_void_p()
config = bytes(json.dumps({"missing": missing, "nthread": nthread}), "utf-8")
config = make_jcargs(
missing=float(missing),
nthread=int(nthread),
data_split_mode=int(data_split_mode),
)
_check_call(
_LIB.XGDMatrixCreateFromCudaColumnar(
interfaces_str,
Expand Down Expand Up @@ -1096,7 +1101,13 @@ def dispatch_data_backend(
)
if _is_cudf_df(data) or _is_cudf_ser(data):
return _from_cudf_df(
data, missing, threads, feature_names, feature_types, enable_categorical
data,
missing,
threads,
feature_names,
feature_types,
enable_categorical,
data_split_mode,
)
if _is_cupy_array(data):
return _from_cupy_array(data, missing, threads, feature_names, feature_types)
Expand Down
6 changes: 4 additions & 2 deletions src/c_api/c_api.cu
Expand Up @@ -96,9 +96,11 @@ XGB_DLL int XGDMatrixCreateFromCudaColumnar(char const *data,

float missing = GetMissing(config);
auto n_threads = OptionalArg<Integer, std::int64_t>(config, "nthread", 0);
auto data_split_mode =
static_cast<DataSplitMode>(OptionalArg<Integer, int64_t>(config, "data_split_mode", 0));
data::CudfAdapter adapter(json_str);
*out =
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, n_threads));
*out = new std::shared_ptr<DMatrix>(
DMatrix::Create(&adapter, missing, n_threads, "", data_split_mode));
API_END();
}

Expand Down
6 changes: 5 additions & 1 deletion src/data/data.cc
Expand Up @@ -1045,13 +1045,17 @@ void SparsePage::SortIndices(int32_t n_threads) {
});
}

void SparsePage::Reindex(uint64_t feature_offset, int32_t n_threads) {
void SparsePage::ReindexCPU(uint64_t feature_offset, int32_t n_threads) {
auto& h_data = this->data.HostVector();
common::ParallelFor(h_data.size(), n_threads, [&](auto i) {
h_data[i].index += feature_offset;
});
}

#if !defined(XGBOOST_USE_CUDA)
// Stub for CPU-only builds: the device reindex path cannot run, so delegate to
// common::AssertGPUSupport(), which presumably reports that CUDA support is
// required — confirm its exact behavior in common/.
void SparsePage::ReindexCUDA(uint64_t feature_offset) { common::AssertGPUSupport(); }
#endif  // !defined(XGBOOST_USE_CUDA)

void SparsePage::SortRows(int32_t n_threads) {
auto& h_offset = this->offset.HostVector();
auto& h_data = this->data.HostVector();
Expand Down
5 changes: 5 additions & 0 deletions src/data/data.cu
Expand Up @@ -169,6 +169,11 @@ void MetaInfo::SetInfoFromCUDA(Context const& ctx, StringView key, Json array) {
}
}

void SparsePage::ReindexCUDA(uint64_t feature_offset) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

bst_feature_t.

auto d_data = this->data.DeviceSpan();
dh::LaunchN(d_data.size(), [=] __device__(size_t idx) { d_data[idx].index += feature_offset; });
}

template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread,
const std::string& cache_prefix, DataSplitMode data_split_mode) {
Expand Down
21 changes: 11 additions & 10 deletions src/data/simple_dmatrix.cc
Expand Up @@ -74,14 +74,18 @@ DMatrix* SimpleDMatrix::SliceCol(int num_slices, int slice_id) {
return out;
}

void SimpleDMatrix::ReindexFeatures(Context const* ctx) {
void SimpleDMatrix::ReindexFeatures() {
if (info_.IsColumnSplit() && collective::GetWorldSize() > 1) {
auto const cols = collective::Allgather(info_.num_col_);
auto const offset = std::accumulate(cols.cbegin(), cols.cbegin() + collective::GetRank(), 0ul);
if (offset == 0) {
return;
}
sparse_page_->Reindex(offset, ctx->Threads());
if (fmat_ctx_.IsCUDA()) {
sparse_page_->ReindexCUDA(offset);
} else {
sparse_page_->ReindexCPU(offset, fmat_ctx_.Threads());
}
}
}

Expand Down Expand Up @@ -216,8 +220,7 @@ BatchSet<ExtSparsePage> SimpleDMatrix::GetExtBatches(Context const*, BatchParam
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
DataSplitMode data_split_mode) {
Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}});
fmat_ctx_.Init(Args{{"nthread", std::to_string(nthread)}});

std::vector<uint64_t> qids;
uint64_t default_max = std::numeric_limits<uint64_t>::max();
Expand All @@ -233,7 +236,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
// Iterate over batches of input data
while (adapter->Next()) {
auto& batch = adapter->Value();
auto batch_max_columns = sparse_page_->Push(batch, missing, ctx.Threads());
auto batch_max_columns = sparse_page_->Push(batch, missing, fmat_ctx_.Threads());
inferred_num_columns = std::max(batch_max_columns, inferred_num_columns);
total_batch_size += batch.Size();
// Append meta information if available
Expand Down Expand Up @@ -282,7 +285,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,

// Synchronise worker columns
info_.data_split_mode = data_split_mode;
ReindexFeatures(&ctx);
ReindexFeatures();
info_.SynchronizeNumberOfColumns();

if (adapter->NumRows() == kAdapterUnknownSize) {
Expand Down Expand Up @@ -315,11 +318,9 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread,
info_.num_nonzero_ = data_vec.size();

// Sort the index for row partitioners used by various tree methods.
if (!sparse_page_->IsIndicesSorted(ctx.Threads())) {
sparse_page_->SortIndices(ctx.Threads());
if (!sparse_page_->IsIndicesSorted(fmat_ctx_.Threads())) {
sparse_page_->SortIndices(fmat_ctx_.Threads());
}

this->fmat_ctx_ = ctx;
}

SimpleDMatrix::SimpleDMatrix(dmlc::Stream* in_stream) {
Expand Down
8 changes: 2 additions & 6 deletions src/data/simple_dmatrix.cu
Expand Up @@ -17,16 +17,13 @@ namespace xgboost::data {
template <typename AdapterT>
SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthread,
DataSplitMode data_split_mode) {
CHECK(data_split_mode != DataSplitMode::kCol)
<< "Column-wise data split is currently not supported on the GPU.";
auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0)
? DeviceOrd::CUDA(dh::CurrentDevice())
: adapter->Device();
CHECK(device.IsCUDA());
dh::safe_cuda(cudaSetDevice(device.ordinal));

Context ctx;
ctx.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});
fmat_ctx_.Init(Args{{"nthread", std::to_string(nthread)}, {"device", device.Name()}});

CHECK(adapter->NumRows() != kAdapterUnknownSize);
CHECK(adapter->NumColumns() != kAdapterUnknownSize);
Expand All @@ -42,9 +39,8 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr
info_.num_row_ = adapter->NumRows();
// Synchronise worker columns
info_.data_split_mode = data_split_mode;
ReindexFeatures();
info_.SynchronizeNumberOfColumns();

this->fmat_ctx_ = ctx;
}

template SimpleDMatrix::SimpleDMatrix(CudfAdapter* adapter, float missing,
Expand Down
2 changes: 1 addition & 1 deletion src/data/simple_dmatrix.h
Expand Up @@ -69,7 +69,7 @@ class SimpleDMatrix : public DMatrix {
* are globally indexed, so we reindex the features based on the offset needed to obtain the
* global view.
*/
void ReindexFeatures(Context const* ctx);
void ReindexFeatures();

private:
// Context used only for DMatrix initialization.
Expand Down
1 change: 1 addition & 0 deletions tests/ci_build/lint_python.py
Expand Up @@ -26,6 +26,7 @@ class LintersPaths:
"tests/python/test_tree_regularization.py",
"tests/python/test_shap.py",
"tests/python/test_with_pandas.py",
"tests/python-gpu/test_from_cudf.py",
"tests/python-gpu/test_gpu_data_iterator.py",
"tests/python-gpu/test_gpu_prediction.py",
"tests/python-gpu/load_pickle.py",
Expand Down