Initial support for one hot categorical split.

dmlc · Sep 24, 2020 · 18f734a · 18f734a
1 parent 72ef553
commit 18f734a
Show file tree

Hide file tree

Showing 31 changed files with 691 additions and 181 deletions.
diff --git a/include/xgboost/feature_map.h b/include/xgboost/feature_map.h
@@ -82,7 +82,9 @@ class FeatureMap {
     if (!strcmp("q", tname)) return kQuantitive;
     if (!strcmp("int", tname)) return kInteger;
     if (!strcmp("float", tname)) return kFloat;
-    LOG(FATAL) << "unknown feature type, use i for indicator and q for quantity";
+    if (!strcmp("categorical", tname)) return kInteger;
+    LOG(FATAL) << "unknown feature type, use i for indicator, q for quantity "
+                  "and categorical for categorical split.";
     return kIndicator;
   }
   /*! \brief name of the feature */

diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py
@@ -384,7 +384,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
                  silent=False,
                  feature_names=None,
                  feature_types=None,
-                 nthread=None):
+                 nthread=None,
+                 enable_categorical=False):
         """Parameters
         ----------
         data : os.PathLike/string/numpy.array/scipy.sparse/pd.DataFrame/
@@ -419,6 +420,17 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
             Number of threads to use for loading data when parallelization is
             applicable. If -1, uses maximum threads available on the system.
 
+        enable_categorical: boolean, optional
+
+            .. versionadded:: 1.3.0
+
+            Experimental support of specializing for categorical features.  Do
+            not set to True unless you are interested in development.
+            Currently it's only available for `gpu_hist` tree method with 1 vs
+            rest (one hot) categorical split.  Also, JSON serialization format,
+            `enable_experimental_json_serialization`, `gpu_predictor` and
+            pandas input are required.
+
         """
         if isinstance(data, list):
             raise TypeError('Input data can not be a list.')
@@ -437,7 +449,8 @@ def __init__(self, data, label=None, weight=None, base_margin=None,
             data, missing=self.missing,
             threads=self.nthread,
             feature_names=feature_names,
-            feature_types=feature_types)
+            feature_types=feature_types,
+            enable_categorical=enable_categorical)
         assert handle is not None
         self.handle = handle
 

diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py
@@ -175,20 +175,24 @@ def _is_modin_df(data):
 }
 
 
-def _transform_pandas_df(data, feature_names=None, feature_types=None,
+def _transform_pandas_df(data, enable_categorical,
+                         feature_names=None, feature_types=None,
                          meta=None, meta_type=None):
     from pandas import MultiIndex, Int64Index
-    from pandas.api.types import is_sparse
+    from pandas.api.types import is_sparse, is_categorical
+
     data_dtypes = data.dtypes
-    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype)
+    if not all(dtype.name in _pandas_dtype_mapper or is_sparse(dtype) or
+               (is_categorical(dtype) and enable_categorical)
                for dtype in data_dtypes):
         bad_fields = [
             str(data.columns[i]) for i, dtype in enumerate(data_dtypes)
             if dtype.name not in _pandas_dtype_mapper
         ]
 
-        msg = """DataFrame.dtypes for data must be int, float or bool.
-                Did not expect the data types in fields """
+        msg = """DataFrame.dtypes for data must be int, float, bool or categorical.  When
+                categorical type is supplied, DMatrix parameter
+                `enable_categorical` must be set to `True`."""
         raise ValueError(msg + ', '.join(bad_fields))
 
     if feature_names is None and meta is None:
@@ -207,6 +211,8 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
             if is_sparse(dtype):
                 feature_types.append(_pandas_dtype_mapper[
                     dtype.subtype.name])
+            elif is_categorical(dtype) and enable_categorical:
+                feature_types.append('categorical')
             else:
                 feature_types.append(_pandas_dtype_mapper[dtype.name])
 
@@ -215,15 +221,21 @@ def _transform_pandas_df(data, feature_names=None, feature_types=None,
             'DataFrame for {meta} cannot have multiple columns'.format(
                 meta=meta))
 
     dtype = meta_type if meta_type else np.float32
-    data = np.ascontiguousarray(data.values, dtype=dtype)
+    try:
+        data = data.values.astype(dtype)
+    except ValueError as e:
+        raise ValueError('Data must be convertable to float, even ' +
+                         'for categorical data.') from e
 
     return data, feature_names, feature_types
 
 
-def _from_pandas_df(data, missing, nthread, feature_names, feature_types):
+def _from_pandas_df(data, enable_categorical, missing, nthread,
+                    feature_names, feature_types):  # the DataFrame is first reduced by _transform_pandas_df, then handed to _from_numpy_array
     data, feature_names, feature_types = _transform_pandas_df(
-        data, feature_names, feature_types)
+        data, enable_categorical, feature_names, feature_types)  # enable_categorical is passed positionally as the second argument
     return _from_numpy_array(data, missing, nthread, feature_names,
                              feature_types)
 
@@ -498,7 +510,8 @@ def _has_array_protocol(data):
 
 
 def dispatch_data_backend(data, missing, threads,
-                          feature_names, feature_types):
+                          feature_names, feature_types,
+                          enable_categorical=False):
     '''Dispatch data for DMatrix.'''
     if _is_scipy_csr(data):
         return _from_scipy_csr(data, missing, feature_names, feature_types)
@@ -514,7 +527,7 @@ def dispatch_data_backend(data, missing, threads,
     if _is_tuple(data):
         return _from_tuple(data, missing, feature_names, feature_types)
     if _is_pandas_df(data):
-        return _from_pandas_df(data, missing, threads,
+        return _from_pandas_df(data, enable_categorical, missing, threads,
                                feature_names, feature_types)
     if _is_pandas_series(data):
         return _from_pandas_series(data, missing, threads, feature_names,
@@ -644,7 +657,8 @@ def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None):
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_df(data):
-        data, _, _ = _transform_pandas_df(data, meta=name, meta_type=dtype)
+        data, _, _ = _transform_pandas_df(data, False, meta=name,
+                                          meta_type=dtype)
         _meta_from_numpy(data, name, dtype, handle)
         return
     if _is_pandas_series(data):

diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh
@@ -80,6 +80,11 @@ struct AtomicDispatcher<sizeof(uint64_t)> {
   using Type = unsigned long long;  // NOLINT
   static_assert(sizeof(Type) == sizeof(uint64_t), "Unsigned long long should be of size 64 bits.");
 };
+
+template <>  // explicit specialization chosen when the size argument equals sizeof(uint8_t)
+struct AtomicDispatcher<sizeof(uint8_t)> {  // for this size the dispatched Type is uint8_t itself
+  using Type = uint8_t;  // NOLINT
+};  // NOTE(review): unlike the 64-bit specialization above, no static_assert guards this mapping — confirm callers only use Type where 8-bit operands are supported
 }  // namespace detail
 }  // namespace dh
 
@@ -536,6 +541,17 @@ void CopyDeviceSpanToVector(std::vector<T> *dst, xgboost::common::Span<const T>
                                 cudaMemcpyDeviceToHost));
 }
 
+template <class HContainer, class DContainer>  // HContainer: host-side source type; DContainer: device-side destination type
+void CopyToD(HContainer const &h, DContainer *d) {  // copies all elements of `h` into `*d`, resizing `*d` to match
+  d->resize(h.size());  // destination must hold exactly h.size() elements before the raw copy below
+  using HVT = std::remove_cv_t<typename HContainer::value_type>;  // host element type with const/volatile stripped
+  using DVT = std::remove_cv_t<typename DContainer::value_type>;  // device element type with const/volatile stripped
+  static_assert(std::is_same<HVT, DVT>::value,  // the copy is a byte-wise memcpy, so element types must be identical
+                "Host and device containers must have same value type.");
+  dh::safe_cuda(cudaMemcpyAsync(d->data().get(), h.data(), h.size() * sizeof(HVT),  // byte count = element count * element size; no stream argument is passed
+                                cudaMemcpyHostToDevice));  // NOTE(review): asynchronous call — presumably `h` must remain valid until the copy has completed; confirm at call sites
+}
+
 // Keep track of pinned memory allocation
 struct PinnedMemory {
   void *temp_storage{nullptr};

diff --git a/src/common/hist_util.cu b/src/common/hist_util.cu
@@ -178,7 +178,7 @@ void ProcessBatch(int device, MetaInfo const &info, const SparsePage &page,
   dh::XGBCachingDeviceAllocator<char> alloc;
   const auto& host_data = page.data.ConstHostVector();
   dh::device_vector<Entry> sorted_entries(host_data.begin() + begin,
-                                                  host_data.begin() + end);
+                                          host_data.begin() + end);
   thrust::sort(thrust::cuda::par(alloc), sorted_entries.begin(),
                sorted_entries.end(), detail::EntryCompareOp());
 

diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc
@@ -10,6 +10,7 @@
 #include <cstdint>
 #include <memory>
 #include <utility>
+#include "xgboost/tree_model.h"
 #include "xgboost/host_device_vector.h"
 
 namespace xgboost {
@@ -176,6 +177,7 @@ template class HostDeviceVector<FeatureType>;
 template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
+template class HostDeviceVector<RegTree::Segment>;
 
 #if defined(__APPLE__)
 /*

diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
@@ -404,6 +404,7 @@ template class HostDeviceVector<Entry>;
 template class HostDeviceVector<uint64_t>;  // bst_row_t
 template class HostDeviceVector<uint32_t>;  // bst_feature_t
 template class HostDeviceVector<RegTree::Node>;
+template class HostDeviceVector<RegTree::Segment>;
 template class HostDeviceVector<RTreeNodeStat>;
 
 #if defined(__APPLE__)

diff --git a/src/data/adapter.h b/src/data/adapter.h
@@ -68,7 +68,7 @@ namespace data {
 /** \brief An adapter can return this value for number of rows or columns
  * indicating that this value is currently unknown and should be inferred while
  * passing over the data. */
-constexpr size_t kAdapterUnknownSize = std::numeric_limits<size_t >::max();
+constexpr size_t kAdapterUnknownSize = std::numeric_limits<bst_row_t>::max();
 
 struct COOTuple {
   COOTuple() = default;

diff --git a/src/data/iterative_device_dmatrix.cu b/src/data/iterative_device_dmatrix.cu
@@ -98,6 +98,9 @@ void IterativeDeviceDMatrix::Initialize(DataIterHandle iter_handle, float missin
         }));
     nnz += thrust::reduce(thrust::cuda::par(alloc), row_counts.begin(),
                           row_counts.end());
+
+    this->Info().feature_types.Resize(proxy->Info().feature_types.Size());
+    this->Info().feature_types.Copy(proxy->Info().feature_types);
     batches++;
   }
   iter.Reset();