dmlc · trivialfis · Aug 2, 2022 · Apr 25, 2022 · Jul 29, 2022
diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst
@@ -22,6 +22,9 @@ Core Data Structure
     :members:
     :show-inheritance:
 
+.. autoclass:: xgboost.QuantileDMatrix
+    :show-inheritance:
+
 .. autoclass:: xgboost.DeviceQuantileDMatrix
     :show-inheritance:
 

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
@@ -415,28 +415,26 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
  *
  * \return 0 when success, -1 when failure happens
  */
-XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
-                                        DMatrixHandle proxy,
-                                        DataIterResetCallback *reset,
-                                        XGDMatrixCallbackNext *next,
-                                        char const* c_json_config,
-                                        DMatrixHandle *out);
+XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
+                                        DataIterResetCallback *reset, XGDMatrixCallbackNext *next,
+                                        char const *c_json_config, DMatrixHandle *out);
 
 /*!
  * \brief Create a Quantile DMatrix with data iterator.
  *
- * Short note for how to use the second set of callback for GPU Hist tree method:
+ * Short note on how to use the second set of callbacks for the (GPU) Hist tree method:
  *
  * - Step 0: Define a data iterator with 2 methods `reset`, and `next`.
  * - Step 1: Create a DMatrix proxy by `XGProxyDMatrixCreate` and hold the handle.
  * - Step 2: Pass the iterator handle, proxy handle and 2 methods into
- *           `XGDeviceQuantileDMatrixCreateFromCallback`.
+ *           `XGQuantileDMatrixCreateFromCallback`.
  * - Step 3: Call appropriate data setters in `next` functions.
  *
- * See test_iterative_device_dmatrix.cu or Python interface for examples.
+ * See test_iterative_dmatrix.cu or Python interface for examples.
  *
  * \param iter     A handle to external data iterator.
  * \param proxy    A DMatrix proxy handle created by `XGProxyDMatrixCreate`.
+ * \param ref      Reference DMatrix for providing quantile information.
  * \param reset    Callback function resetting the iterator state.
  * \param next     Callback function yielding the next batch of data.
  * \param missing  Which value to represent missing value
@@ -446,10 +444,20 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter,
  *
  * \return 0 when success, -1 when failure happens
  */
-XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(
-    DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
-    XGDMatrixCallbackNext *next, float missing, int nthread, int max_bin,
-    DMatrixHandle *out);
+XGB_DLL int XGQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
+                                                DataIterHandle ref, DataIterResetCallback *reset,
+                                                XGDMatrixCallbackNext *next, char const *config,
+                                                DMatrixHandle *out);
+
+/*!
+ * \brief Create a Device Quantile DMatrix with data iterator.
+ * \deprecated since 2.0
+ * \see XGQuantileDMatrixCreateFromCallback()
+ */
+XGB_DLL int XGDeviceQuantileDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy,
+                                                      DataIterResetCallback *reset,
+                                                      XGDMatrixCallbackNext *next, float missing,
+                                                      int nthread, int max_bin, DMatrixHandle *out);
 
 /*!
  * \brief Set data on a DMatrix proxy.

diff --git a/python-package/xgboost/__init__.py b/python-package/xgboost/__init__.py
@@ -6,6 +6,7 @@
 from .core import (
     DMatrix,
     DeviceQuantileDMatrix,
+    QuantileDMatrix,
     Booster,
     DataIter,
     build_info,
@@ -33,6 +34,7 @@
     # core
     "DMatrix",
     "DeviceQuantileDMatrix",
+    "QuantileDMatrix",
     "Booster",
     "DataIter",
     "train",

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -1146,7 +1146,7 @@ def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
 
         Parameters
         ----------
-        feature_types : list or None
+        feature_types :
             Labels for features. None will reset existing feature names
 
         """
@@ -1189,7 +1189,7 @@ def feature_types(self, feature_types: Optional[Union[List[str], str]]) -> None:
 
 
 class _ProxyDMatrix(DMatrix):
-    """A placeholder class when DMatrix cannot be constructed (DeviceQuantileDMatrix,
+    """A placeholder class when DMatrix cannot be constructed (QuantileDMatrix,
     inplace_predict).
 
     """
@@ -1234,17 +1234,35 @@ def _set_data_from_csr(self, csr: scipy.sparse.csr_matrix) -> None:
         )
 
 
-class DeviceQuantileDMatrix(DMatrix):
-    """Device memory Data Matrix used in XGBoost for training with tree_method='gpu_hist'. Do
-    not use this for test/validation tasks as some information may be lost in
-    quantisation. This DMatrix is primarily designed to save memory in training from
-    device memory inputs by avoiding intermediate storage. Set max_bin to control the
-    number of bins during quantisation.  See doc string in :py:obj:`xgboost.DMatrix` for
-    documents on meta info.
+class QuantileDMatrix(DMatrix):
+    """A DMatrix variant that generates quantized data directly from input for
+    ``hist`` and ``gpu_hist`` tree methods. This DMatrix is primarily designed to save
+    memory in training by avoiding intermediate storage. Set ``max_bin`` to control the
+    number of bins during quantisation, which should be consistent with the training
+    parameter ``max_bin``. When ``QuantileDMatrix`` is used for validation/test dataset,
+    ``ref`` should be another ``QuantileDMatrix`` (or ``DMatrix``, but not recommended
+    as it defeats the purpose of saving memory) constructed from the training dataset.
+    See :py:obj:`xgboost.DMatrix` for documents on meta info.
 
-    You can construct DeviceQuantileDMatrix from cupy/cudf/dlpack.
+    .. note::
 
-    .. versionadded:: 1.1.0
+        Do not use ``QuantileDMatrix`` as validation/test dataset without supplying a
+        reference (the training dataset) ``QuantileDMatrix`` using ``ref`` as some
+        information may be lost in quantisation.
+
+    .. versionadded:: 2.0.0
+
+    Parameters
+    ----------
+    max_bin :
+        The number of histogram bins; should be consistent with the training parameter
+        ``max_bin``.
+
+    ref :
+        The training dataset that provides quantile information, needed when creating
+        validation/test dataset with ``QuantileDMatrix``. Supplying the training DMatrix
+        as a reference means that the same quantisation applied to the training data is
+        applied to the validation/test data.
 
     """
 
@@ -1261,17 +1279,18 @@ def __init__(  # pylint: disable=super-init-not-called
         feature_names: Optional[FeatureNames] = None,
         feature_types: Optional[FeatureTypes] = None,
         nthread: Optional[int] = None,
-        max_bin: int = 256,
+        max_bin: Optional[int] = None,
+        ref: Optional[DMatrix] = None,
         group: Optional[ArrayLike] = None,
         qid: Optional[ArrayLike] = None,
         label_lower_bound: Optional[ArrayLike] = None,
         label_upper_bound: Optional[ArrayLike] = None,
         feature_weights: Optional[ArrayLike] = None,
         enable_categorical: bool = False,
     ) -> None:
-        self.max_bin = max_bin
+        self.max_bin: int = max_bin if max_bin is not None else 256
         self.missing = missing if missing is not None else np.nan
-        self.nthread = nthread if nthread is not None else 1
+        self.nthread = nthread if nthread is not None else -1
         self._silent = silent  # unused, kept for compatibility
 
         if isinstance(data, ctypes.c_void_p):
@@ -1280,12 +1299,13 @@ def __init__(  # pylint: disable=super-init-not-called
 
         if qid is not None and group is not None:
             raise ValueError(
-                'Only one of the eval_qid or eval_group for each evaluation '
-                'dataset should be provided.'
+                "Only one of the eval_qid or eval_group for each evaluation "
+                "dataset should be provided."
             )
 
         self._init(
             data,
+            ref=ref,
             label=label,
             weight=weight,
             base_margin=base_margin,
@@ -1299,7 +1319,13 @@ def __init__(  # pylint: disable=super-init-not-called
             enable_categorical=enable_categorical,
         )
 
-    def _init(self, data: DataType, enable_categorical: bool, **meta: Any) -> None:
+    def _init(
+        self,
+        data: DataType,
+        ref: Optional[DMatrix],
+        enable_categorical: bool,
+        **meta: Any,
+    ) -> None:
         from .data import (
             _is_dlpack,
             _transform_dlpack,
@@ -1317,20 +1343,26 @@ def _init(self, data: DataType, enable_categorical: bool, **meta: Any) -> None:
             it = SingleBatchInternalIter(data=data, **meta)
 
         handle = ctypes.c_void_p()
-        reset_callback, next_callback = it.get_callbacks(False, enable_categorical)
+        reset_callback, next_callback = it.get_callbacks(True, enable_categorical)
         if it.cache_prefix is not None:
             raise ValueError(
-                "DeviceQuantileDMatrix doesn't cache data, remove the cache_prefix "
+                "QuantileDMatrix doesn't cache data, remove the cache_prefix "
                 "in iterator to fix this error."
             )
-        ret = _LIB.XGDeviceQuantileDMatrixCreateFromCallback(
+
+        args = {
+            "nthread": self.nthread,
+            "missing": self.missing,
+            "max_bin": self.max_bin,
+        }
+        config = from_pystr_to_cstr(json.dumps(args))
+        ret = _LIB.XGQuantileDMatrixCreateFromCallback(
             None,
             it.proxy.handle,
+            ref.handle if ref is not None else ref,
             reset_callback,
             next_callback,
-            ctypes.c_float(self.missing),
-            ctypes.c_int(self.nthread),
-            ctypes.c_int(self.max_bin),
+            config,
             ctypes.byref(handle),
         )
         it.reraise()
@@ -1339,6 +1371,20 @@ def _init(self, data: DataType, enable_categorical: bool, **meta: Any) -> None:
         self.handle = handle
 
 
+class DeviceQuantileDMatrix(QuantileDMatrix):
+    """Use ``QuantileDMatrix`` instead.
+
+    .. deprecated:: 2.0.0
+
+    .. versionadded:: 1.1.0
+
+    """
+
+    def __init__(self, *args: Any, **kwargs: Any) -> None:
+        warnings.warn("Please use `QuantileDMatrix` instead.", FutureWarning)
+        super().__init__(*args, **kwargs)
+
+
 Objective = Callable[[np.ndarray, DMatrix], Tuple[np.ndarray, np.ndarray]]
 Metric = Callable[[np.ndarray, DMatrix], Tuple[str, float]]