From 6f2b3c78febb05c065fca33c740e49a6fb0bf4ee Mon Sep 17 00:00:00 2001
From: fis
Date: Tue, 26 Oct 2021 21:13:34 +0800
Subject: [PATCH] Implement typed and type-erased tensors.

* Use a typed tensor (`linalg::Tensor`) for storing meta info like the base margin.
* Extend the array interface handler to multi-dimensional inputs, and implement a
  general array view (`linalg::TensorView`) that replaces the existing matrix and
  vector views.
* Add a document on handling user input data, validate the shape of the base margin
  and other meta info, and clean up the prediction output.
* Add clang-format config (#7383), generated using
  `clang-format -style=google -dump-config > .clang-format`, with column width
  changed from 80 to 100 to be consistent with the existing cpplint check.
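
An illustrative sketch of the new linalg types (shapes and values are hypothetical),
based on the API this patch adds in include/xgboost/linalg.h:

    #include <vector>
    #include "xgboost/linalg.h"

    void Example() {
      using namespace xgboost;
      // A 3x4 matrix view over existing host memory; device ordinal -1 means host.
      std::vector<float> storage(3 * 4, 0.0f);
      auto mat = linalg::TensorView<float, 2>{
          {storage.data(), storage.size()}, {3, 4}, -1};
      mat(0, 1) = 1.0f;                        // multi-dimensional element access
      auto row = mat.Slice(0, linalg::All());  // first row as a 1-dim view
      row(2) = 2.0f;
      // Owning storage; hands out host or device views on demand.
      linalg::Tensor<float, 2> tensor{{3, 4}, -1};
      auto h_view = tensor.View(-1);
      h_view(1, 2) = 3.0f;
    }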
---
 doc/contrib/coding_guide.rst | 42 ++
 include/xgboost/c_api.h | 31 +-
 include/xgboost/data.h | 20 +-
 include/xgboost/intrusive_ptr.h | 4 +-
 include/xgboost/linalg.h | 472 +++++++++++++++--
 include/xgboost/predictor.h | 2 +-
 .../xgboost4j-gpu/src/native/xgboost4j-gpu.cu | 30 +-
 python-package/xgboost/data.py | 34 +-
 src/common/common.cu | 3 +-
 src/common/host_device_vector.cc | 1 +
 src/common/host_device_vector.cu | 1 +
 src/common/linalg_op.cuh | 25 +
 src/common/linalg_op.h | 25 +
 src/data/adapter.h | 80 ++-
 src/data/array_interface.cu | 53 +-
 src/data/array_interface.h | 485 +++++++++++-------
 src/data/data.cc | 377 +++++++++-----
 src/data/data.cu | 184 +++----
 src/data/device_adapter.cuh | 47 +-
 src/data/file_iterator.h | 15 +-
 src/data/simple_dmatrix.cc | 7 +-
 src/data/validation.h | 40 ++
 src/gbm/gblinear.cc | 19 +-
 src/predictor/cpu_predictor.cc | 28 +-
 src/predictor/gpu_predictor.cu | 29 +-
 src/predictor/predictor.cc | 31 +-
 src/tree/hist/evaluate_splits.h | 2 +-
 src/tree/updater_colmaker.cc | 2 +-
 src/tree/updater_gpu_hist.cu | 2 +-
 tests/cpp/common/test_hist_util.h | 2 +-
 tests/cpp/common/test_linalg.cc | 139 ++++-
 tests/cpp/common/test_linalg.cu | 62 +++
 tests/cpp/data/test_adapter.cc | 7 +-
 tests/cpp/data/test_array_interface.cc | 55 +-
 tests/cpp/data/test_array_interface.cu | 15 +-
 tests/cpp/data/test_array_interface.h | 10 +-
 .../cpp/data/test_iterative_device_dmatrix.cu | 4 +-
 tests/cpp/data/test_metainfo.cc | 21 +-
 tests/cpp/data/test_metainfo.cu | 19 +-
 tests/cpp/data/test_metainfo.h | 82 +++
 tests/cpp/data/test_simple_dmatrix.cc | 10 +-
 tests/cpp/data/test_simple_dmatrix.cu | 10 +-
 tests/cpp/helpers.cc | 3 +-
 tests/cpp/helpers.h | 5 +-
 tests/cpp/predictor/test_gpu_predictor.cu | 4 +-
 tests/python/test_dmatrix.py | 5 +-
 tests/python/test_with_dask.py | 49 +-
 tests/python/test_with_sklearn.py | 94 +---
 48 files changed, 1825 insertions(+), 862 deletions(-)
 create mode 100644 src/common/linalg_op.cuh
 create mode 100644 src/common/linalg_op.h
 create mode 100644 src/data/validation.h
 create mode 100644 tests/cpp/common/test_linalg.cu
 create mode 100644 tests/cpp/data/test_metainfo.h

diff --git a/doc/contrib/coding_guide.rst b/doc/contrib/coding_guide.rst
index 6d407ba129e9..92dcc4d9d8e2 100644
--- a/doc/contrib/coding_guide.rst
+++ b/doc/contrib/coding_guide.rst
@@ -134,3 +134,45 @@ Similarly, if you want to exclude C++ source from linting:

   cd /path/to/xgboost/
   python3 tests/ci_build/tidy.py --cpp=0
+
+**********************************
+Guide for handling user input data
+**********************************
+
+This is a brief guide for handling user input data. XGBoost natively supports a wide
+range of data structures, mostly coming from higher-level language bindings, ranging
+from a basic contiguous 1-dimensional memory buffer to more sophisticated structures
+like columnar data with a validity mask. Raw data can be used in 2 places: first, the
+construction of the various ``DMatrix`` objects, and second, in-place prediction. For
+a plain memory buffer there's not much to discuss, but for general arrays and
+columnar data there are many pitfalls. XGBoost has 3 different data structures for
+handling arrays (tensors); for consuming user inputs, ``ArrayInterface`` should be
+chosen. Many existing functions accept only a plain pointer due to legacy reasons
+(XGBoost started as a much smaller library and didn't care about memory usage that
+much back then).
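+
+For example, a minimal sketch (with a hypothetical data pointer and shape) of viewing
+a dense input through ``ArrayInterface``:
+
+.. code-block:: cpp
+
+   // The JSON document follows the `__array_interface__` protocol; the data pointer
+   // here is illustrative only and normally comes from the user's array.
+   std::string interface_str = R"({
+     "data": [94097536, true], "shape": [16, 2], "typestr": "<f4", "version": 3
+   })";
+   xgboost::ArrayInterface<2> array{xgboost::StringView{interface_str}};
+   CHECK_EQ(array.Shape(0), 16);  // number of rows
+   CHECK_EQ(array.Shape(1), 2);   // number of columns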
+The ``ArrayInterface`` is an in-memory representation of the ``__array_interface__``
+protocol defined by numpy or the ``__cuda_array_interface__`` defined by numba. The
+following is a checklist of things to keep in mind when accepting related user inputs:
+
+- [ ] Is it strided? (identified by the ``strides`` field)
+- [ ] Is the data type supported? Half types and 128-bit types should be converted
+  before going into XGBoost.
+- [ ] Does it have more than 1 dimension? (identified by the ``shape`` field)
+- [ ] Are some of the dimensions trivial? (``shape[dim] <= 1``)
+- [ ] Does it have a mask? (identified by the ``mask`` field)
+- [ ] Can the mask be broadcast? (unsupported at the moment)
+- [ ] Is it on CUDA memory? (identified by the ``stream`` field)
+
+Most of the checks are handled by ``ArrayInterface`` during construction, except for
+the data type, since C++ has no portable built-in types for half-precision or 128-bit
+values. For safety reasons one should still write tests for all items. The data type
+issue should be taken care of in the language binding for each specific data input.
+For the columnar format, each column is just a masked array, so columns should be
+treated uniformly. For the input predictor ``X``, we have adapters for each type of
+input, some of which are compositions of others. For instance, a CSR matrix has 3
+potentially strided arrays for ``indptr``, ``indices`` and ``values``; no assumptions
+should be made about these components (all the check boxes should be considered), and
+slicing a row of a CSR matrix must calculate the offset based on the strides.
+
+For meta info, which is growing in both size and complexity, we accept only masked
+arrays at the moment (no specialized adapter). One should be careful about the input
+data shape; the base margin can be 2-dimensional or higher if we have multiple
+targets in the future. The getters in ``DMatrix`` return only 1-dimensional flattened
+vectors, which can be improved in the future when needed.

diff --git a/include/xgboost/c_api.h b/include/xgboost/c_api.h
index 8d929593c1a4..9d657872e26b 100644
--- a/include/xgboost/c_api.h
+++ b/include/xgboost/c_api.h
@@ -249,7 +249,7 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
                                                   char const* json_config,
                                                   DMatrixHandle *out);

-/*
+/**
 * ========================== Begin data callback APIs =========================
 *
 * Short notes for data callback
 *
@@ -258,9 +258,9 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
 * used by JVM packages. It uses `XGBoostBatchCSR` to accept batches for CSR formated
 * input, and concatenate them into 1 final big CSR. The related functions are:
 *
- * - XGBCallbackSetData
- * - XGBCallbackDataIterNext
- * - XGDMatrixCreateFromDataIter
+ * - \ref XGBCallbackSetData
+ * - \ref XGBCallbackDataIterNext
+ * - \ref XGDMatrixCreateFromDataIter
 *
 * Another set is used by external data iterator. It accept foreign data iterators as
 * callbacks.
There are 2 different senarios where users might want to pass in callbacks
@@ -276,17 +276,17 @@ XGB_DLL int XGDMatrixCreateFromCudaArrayInterface(char const *data,
 * Related functions are:
 *
 * # Factory functions
- * - `XGDMatrixCreateFromCallback` for external memory
- * - `XGDeviceQuantileDMatrixCreateFromCallback` for quantile DMatrix
+ * - \ref XGDMatrixCreateFromCallback for external memory
+ * - \ref XGDeviceQuantileDMatrixCreateFromCallback for quantile DMatrix
 *
 * # Proxy that callers can use to pass data to XGBoost
- * - XGProxyDMatrixCreate
- * - XGDMatrixCallbackNext
- * - DataIterResetCallback
- * - XGProxyDMatrixSetDataCudaArrayInterface
- * - XGProxyDMatrixSetDataCudaColumnar
- * - XGProxyDMatrixSetDataDense
- * - XGProxyDMatrixSetDataCSR
+ * - \ref XGProxyDMatrixCreate
+ * - \ref XGDMatrixCallbackNext
+ * - \ref DataIterResetCallback
+ * - \ref XGProxyDMatrixSetDataCudaArrayInterface
+ * - \ref XGProxyDMatrixSetDataCudaColumnar
+ * - \ref XGProxyDMatrixSetDataDense
+ * - \ref XGProxyDMatrixSetDataCSR
 * - ... (data setters)
 */
@@ -411,7 +411,7 @@ XGB_EXTERN_C typedef void DataIterResetCallback(DataIterHandle handle); // NOLIN
 * - cache_prefix: The path of cache file, caller must initialize all the directories in this path.
 * - nthread (optional): Number of threads used for initializing DMatrix.
 *
- * \param out The created external memory DMatrix
+ * \param[out] out The created external memory DMatrix
 *
 * \return 0 when success, -1 when failure happens
 */
@@ -605,7 +605,8 @@ XGB_DLL int XGDMatrixSetUIntInfo(DMatrixHandle handle,
 *   char const* feat_names [] {"feat_0", "feat_1"};
 *   XGDMatrixSetStrFeatureInfo(handle, "feature_name", feat_names, 2);
 *
- *   // i for integer, q for quantitive. Similarly "int" and "float" are also recognized.
+ *   // i for integer, q for quantitative, c for categorical. Similarly "int" and "float"
+ *   // are also recognized.
 *   char const* feat_types [] {"i", "q"};
 *   XGDMatrixSetStrFeatureInfo(handle, "feature_type", feat_types, 2);
 *
diff --git a/include/xgboost/data.h b/include/xgboost/data.h
index cd000e371332..90cbc1373713 100644
--- a/include/xgboost/data.h
+++ b/include/xgboost/data.h
@@ -11,12 +11,14 @@
 #include
 #include
 #include
-#include
 #include
+#include
+#include
+#include
+#include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -45,7 +47,7 @@ enum class FeatureType : uint8_t {
 class MetaInfo {
  public:
  /*! \brief number of data fields in MetaInfo */
-  static constexpr uint64_t kNumField = 11;
+  static constexpr uint64_t kNumField = 12;
  /*! \brief number of rows in the data */
  uint64_t num_row_{0};  // NOLINT
@@ -67,7 +69,7 @@ class MetaInfo {
   * if specified, xgboost will start from this init margin
   * can be used to specify initial prediction to boost from.
   */
-  HostDeviceVector base_margin_;  // NOLINT
+  linalg::Tensor base_margin_;  // NOLINT
  /*!
   * \brief lower bound of the label, to be used for survival analysis (censored regression)
   */
@@ -93,7 +95,7 @@ class MetaInfo {
  /*!
   * \brief Weight of each feature, used to define the probability of each feature being
   * selected when using column sampling.
   */
-  HostDeviceVector feature_weigths;
+  HostDeviceVector feature_weights;

  /*! \brief default constructor */
  MetaInfo() = default;
@@ -157,7 +159,8 @@ class MetaInfo {
   *
   * Right now only 1 column is permitted.
*/ - void SetInfo(const char* key, std::string const& interface_str); + + void SetInfo(StringView key, StringView interface_str); void GetInfo(char const* key, bst_ulong* out_len, DataType dtype, const void** out_dptr) const; @@ -179,6 +182,9 @@ class MetaInfo { void Extend(MetaInfo const& that, bool accumulate_rows, bool check_column); private: + void SetInfoFromHost(StringView key, Json arr); + void SetInfoFromCUDA(StringView key, Json arr); + /*! \brief argsort of labels */ mutable std::vector label_order_cache_; }; @@ -477,7 +483,7 @@ class DMatrix { this->Info().SetInfo(key, dptr, dtype, num); } virtual void SetInfo(const char* key, std::string const& interface_str) { - this->Info().SetInfo(key, interface_str); + this->Info().SetInfo(key, StringView{interface_str}); } /*! \brief meta information of the dataset */ virtual const MetaInfo& Info() const = 0; diff --git a/include/xgboost/intrusive_ptr.h b/include/xgboost/intrusive_ptr.h index 1c58704c4ca3..df7ae30213d6 100644 --- a/include/xgboost/intrusive_ptr.h +++ b/include/xgboost/intrusive_ptr.h @@ -19,7 +19,7 @@ namespace xgboost { */ class IntrusivePtrCell { private: - std::atomic count_; + std::atomic count_ {0}; template friend class IntrusivePtr; std::int32_t IncRef() noexcept { @@ -31,7 +31,7 @@ class IntrusivePtrCell { bool IsZero() const { return Count() == 0; } public: - IntrusivePtrCell() noexcept : count_{0} {} + IntrusivePtrCell() noexcept = default; int32_t Count() const { return count_.load(std::memory_order_relaxed); } }; diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index a801228e903e..bb0486866f81 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -6,12 +6,16 @@ #ifndef XGBOOST_LINALG_H_ #define XGBOOST_LINALG_H_ +#include #include -#include +#include +#include #include #include #include +#include +#include #include #include #include @@ -19,16 +23,35 @@ namespace xgboost { namespace linalg { namespace detail { -template -constexpr size_t Offset(S (&strides)[D], size_t n, size_t dim, Head head) { - assert(dim < D); + +struct ArrayInterfaceHandler { + template + static constexpr char TypeChar() { + return (std::is_floating_point::value + ? 'f' + : (std::is_integral::value ? (std::is_signed::value ? 
'i' : 'u') : '\0')); + } +}; + +template +constexpr size_t Offset(S (&strides)[D], size_t n, Head head) { + static_assert(dim < D, ""); return n + head * strides[dim]; } -template -constexpr size_t Offset(S (&strides)[D], size_t n, size_t dim, Head head, Tail &&...rest) { - assert(dim < D); - return Offset(strides, n + (head * strides[dim]), dim + 1, rest...); +template +constexpr std::enable_if_t Offset(S (&strides)[D], size_t n, + Head head, Tail &&...rest) { + static_assert(dim < D, ""); + return Offset(strides, n + (head * strides[dim]), std::forward(rest)...); +} + +template +constexpr void CalcStride(size_t (&shape)[D], size_t (&stride)[D]) { + stride[D - 1] = 1; + for (int32_t s = D - 2; s >= 0; --s) { + stride[s] = shape[s + 1] * stride[s + 1]; + } } struct AllTag {}; @@ -71,6 +94,101 @@ XGBOOST_DEVICE constexpr auto UnrollLoop(Fn fn) { fn(i); } } + +template +int32_t NativePopc(T v) { + int c = 0; + for (; v != 0; v &= v - 1) c++; + return c; +} + +inline XGBOOST_DEVICE int Popc(uint32_t v) { +#if defined(__CUDA_ARCH__) + return __popc(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcount(v); +#elif defined(_MSC_VER) + return __popcnt(v); +#else + return NativePopc(v); +#endif // compiler +} + +inline XGBOOST_DEVICE int Popc(uint64_t v) { +#if defined(__CUDA_ARCH__) + return __popcll(v); +#elif defined(__GNUC__) || defined(__clang__) + return __builtin_popcountll(v); +#elif defined(_MSC_VER) + return __popcnt64(v); +#else + return NativePopc(v); +#endif // compiler +} + +template +constexpr auto Arr2Tup(T (&arr)[N], std::index_sequence) { + return std::make_tuple(arr[Idx]...); +} + +template +constexpr auto Arr2Tup(T (&arr)[N]) { + return Arr2Tup(arr, std::make_index_sequence{}); +} + +// uint division optimization inspired by the CIndexer in cupy. Division operation is +// slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 +// bit when the index is smaller, then try to avoid division when it's exp of 2. +template +XGBOOST_DEVICE auto UnravelImpl(I idx, common::Span shape) { + size_t index[D]{0}; + static_assert(std::is_signed::value, + "Don't change the type without changing the for loop."); + for (int32_t dim = D; --dim > 0;) { + auto s = static_cast>>(shape[dim]); + if (s & (s - 1)) { + auto t = idx / s; + index[dim] = idx - t * s; + idx = t; + } else { // exp of 2 + index[dim] = idx & (s - 1); + idx >>= Popc(s - 1); + } + } + index[0] = idx; + return Arr2Tup(index); +} + +template +void ReshapeImpl(size_t (&out_shape)[D], I s) { + static_assert(dim < D, ""); + out_shape[dim] = s; +} + +template * = nullptr> +void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) { + static_assert(dim < D, ""); + out_shape[dim] = s; + ReshapeImpl(out_shape, std::forward(rest)...); +} + +template +XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t, std::index_sequence) { + return f(std::get(t)...); +} + +/** + * C++ 17 style apply. + * + * \param f function to apply + * \param t tuple of arguments + */ +template +XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t) { + constexpr auto kSize = std::tuple_size::value; + return Apply(std::forward(f), std::forward(t), std::make_index_sequence{}); +} } // namespace detail /** @@ -85,6 +203,11 @@ constexpr detail::AllTag All() { return {}; } * much linear algebra routines, this class is a helper intended to ease some high level * operations like indexing into prediction tensor or gradient matrix. 
It can be passed * into CUDA kernel as normal argument for GPU algorithms. + * + * Ideally we should add a template parameter `bool on_host` so that the compiler can + * prevent passing/accessing the wrong view, but inheritance is heavily used in XGBoost so + * some functions expect data types that can be used in everywhere (update prediction + * cache for example). */ template class TensorView { @@ -96,7 +219,7 @@ class TensorView { StrideT stride_{1}; ShapeT shape_{0}; common::Span data_; - T* ptr_{nullptr}; // pointer of data_ to avoid bound check. + T *ptr_{nullptr}; // pointer of data_ to avoid bound check. size_t size_{0}; int32_t device_{-1}; @@ -110,42 +233,45 @@ class TensorView { } } - struct SliceHelper { - size_t old_dim; - size_t new_dim; - size_t offset; - }; - - template - XGBOOST_DEVICE SliceHelper MakeSliceDim(size_t old_dim, size_t new_dim, size_t new_shape[D], - size_t new_stride[D], detail::AllTag) const { + template + XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + detail::AllTag) const { + static_assert(new_dim < D, ""); + static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; new_shape[new_dim] = shape_[old_dim]; - return {old_dim + 1, new_dim + 1, 0}; + return 0; } - - template - XGBOOST_DEVICE SliceHelper MakeSliceDim(size_t old_dim, size_t new_dim, size_t new_shape[D], - size_t new_stride[D], detail::AllTag, - S &&...slices) const { + /** + * \brief Slice dimension for All tag. + */ + template + XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag, + S &&...slices) const { + static_assert(new_dim < D, ""); + static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; new_shape[new_dim] = shape_[old_dim]; - return MakeSliceDim(old_dim + 1, new_dim + 1, new_shape, new_stride, slices...); + return MakeSliceDim(new_shape, new_stride, + std::forward(slices)...); } - template - XGBOOST_DEVICE SliceHelper MakeSliceDim(size_t old_dim, size_t new_dim, size_t new_shape[D], - size_t new_stride[D], Index i) const { - return {old_dim + 1, new_dim, stride_[old_dim] * i}; + template + XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], Index i) const { + static_assert(old_dim < kDim, ""); + return stride_[old_dim] * i; } - - template - XGBOOST_DEVICE std::enable_if_t::value, SliceHelper> MakeSliceDim( - size_t old_dim, size_t new_dim, size_t new_shape[D], size_t new_stride[D], Index i, - S &&...slices) const { + /** + * \brief Slice dimension for Index tag. + */ + template + XGBOOST_DEVICE std::enable_if_t::value, size_t> MakeSliceDim( + size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const { + static_assert(old_dim < kDim, ""); auto offset = stride_[old_dim] * i; - auto res = MakeSliceDim(old_dim + 1, new_dim, new_shape, new_stride, slices...); - return {res.old_dim, res.new_dim, res.offset + offset}; + auto res = + MakeSliceDim(new_shape, new_stride, std::forward(slices)...); + return res + offset; } public: @@ -174,12 +300,10 @@ class TensorView { shape_[i] = 1; } // stride - stride_[kDim - 1] = 1; - for (int32_t s = kDim - 2; s >= 0; --s) { - stride_[s] = shape_[s + 1] * stride_[s + 1]; - } + detail::CalcStride(shape_, stride_); + // size this->CalcSize(); - }; + } /** * \brief Create a tensor with data, shape and strides. 
Don't use this constructor if @@ -195,7 +319,7 @@ class TensorView { stride_[i] = stride[i]; }); this->CalcSize(); - }; + } XGBOOST_DEVICE TensorView(TensorView const &that) : data_{that.data_}, ptr_{data_.data()}, size_{that.size_}, device_{that.device_} { @@ -221,7 +345,8 @@ class TensorView { template XGBOOST_DEVICE T &operator()(Index &&...index) { static_assert(sizeof...(index) <= kDim, "Invalid index."); - size_t offset = detail::Offset(stride_, 0ul, 0ul, index...); + size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); + assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; } /** @@ -230,12 +355,14 @@ class TensorView { template XGBOOST_DEVICE T const &operator()(Index &&...index) const { static_assert(sizeof...(index) <= kDim, "Invalid index."); - size_t offset = detail::Offset(stride_, 0ul, 0ul, index...); + size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); + assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; } /** - * \brief Slice the tensor. The returned tensor has inferred dim and shape. + * \brief Slice the tensor. The returned tensor has inferred dim and shape. Scalar + * result is not supported. * * \code * @@ -252,10 +379,10 @@ class TensorView { int32_t constexpr kNewDim{detail::CalcSliceDim...>()}; size_t new_shape[kNewDim]; size_t new_stride[kNewDim]; - auto res = MakeSliceDim(size_t(0), size_t(0), new_shape, new_stride, slices...); + auto offset = MakeSliceDim<0, 0, kNewDim>(new_shape, new_stride, std::forward(slices)...); // ret is a different type due to changed dimension, so we can not access its private // fields. - TensorView ret{data_.subspan(data_.empty() ? 0 : res.offset), new_shape, new_stride, + TensorView ret{data_.subspan(data_.empty() ? 0 : offset), new_shape, new_stride, device_}; return ret; } @@ -275,12 +402,91 @@ class TensorView { XGBOOST_DEVICE auto cend() const { return data_.cend(); } // NOLINT XGBOOST_DEVICE auto begin() { return data_.begin(); } // NOLINT XGBOOST_DEVICE auto end() { return data_.end(); } // NOLINT - + /** + * \brief Number of items in the tensor. + */ XGBOOST_DEVICE size_t Size() const { return size_; } + /** + * \brief Whether it's a contiguous array. (c and f contiguous are both contiguous) + */ + XGBOOST_DEVICE bool Contiguous() const { return size_ == data_.size(); } + /** + * \brief Obtain the raw data. + */ XGBOOST_DEVICE auto Values() const { return data_; } + /** + * \brief Obtain the CUDA device ordinal. + */ XGBOOST_DEVICE auto DeviceIdx() const { return device_; } + + /** + * \brief Array Interface defined by + * numpy. + * + * `stream` is optionally included when data is on CUDA device. + */ + Json ArrayInterface() const { + Json array_interface{Object{}}; + array_interface["data"] = std::vector(2); + array_interface["data"][0] = Integer(reinterpret_cast(data_.data())); + array_interface["data"][1] = Boolean{true}; + if (this->DeviceIdx() >= 0) { + // Change this once we have different CUDA stream. 
+      array_interface["stream"] = Null{};
+    }
+    std::vector<Json> shape(Shape().size());
+    std::vector<Json> stride(Stride().size());
+    for (size_t i = 0; i < Shape().size(); ++i) {
+      shape[i] = Integer(Shape(i));
+      stride[i] = Integer(Stride(i) * sizeof(T));
+    }
+    array_interface["shape"] = Array{shape};
+    array_interface["strides"] = Array{stride};
+    array_interface["version"] = 3;
+
+    char constexpr kT = detail::ArrayInterfaceHandler::TypeChar<T>();
+    static_assert(kT != '\0', "");
+    if (DMLC_LITTLE_ENDIAN) {
+      array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))};
+    } else {
+      array_interface["typestr"] = String{">" + (kT + std::to_string(sizeof(T)))};
+    }
+    return array_interface;
+  }
+  /**
+   * \brief Same as the const version, but returns a non-readonly data pointer.
+   */
+  Json ArrayInterface() {
+    auto const &as_const = *this;
+    auto res = as_const.ArrayInterface();
+    res["data"][1] = Boolean{false};
+    return res;
+  }
+
+  auto ArrayInterfaceStr() const {
+    std::string str;
+    Json::Dump(this->ArrayInterface(), &str);
+    return str;
+  }
+  auto ArrayInterfaceStr() {
+    std::string str;
+    Json::Dump(this->ArrayInterface(), &str);
+    return str;
+  }
 };

+/**
+ * \brief Turns a linear index into a multi-dimensional index. Similar to numpy unravel.
+ */
+template
+XGBOOST_DEVICE auto UnravelIndex(size_t idx, common::Span shape) {
+  if (idx > std::numeric_limits::max()) {
+    return detail::UnravelImpl(static_cast(idx), shape);
+  } else {
+    return detail::UnravelImpl(static_cast(idx), shape);
+  }
+}
+
 /**
 * \brief A view over a vector, specialization of Tensor
 *
@@ -289,6 +495,19 @@ class TensorView {
 template
 using VectorView = TensorView;

+/**
+ * \brief Create a vector view from contiguous memory.
+ *
+ * \param ptr    Pointer to the contiguous memory.
+ * \param s      Size of the vector.
+ * \param device (optional) Device ordinal, defaults to host.
+ */
+template
+auto MakeVec(T *ptr, size_t s, int32_t device = -1) {
+  using U = std::remove_const_t> const;
+  return linalg::TensorView{{ptr, s}, {s}, device};
+}
+
 /**
 * \brief A view over a matrix, specialization of Tensor.
 *
@@ -296,6 +515,163 @@ using VectorView = TensorView;
 */
 template
 using MatrixView = TensorView;
+
+/**
+ * \brief A tensor storage. To use it for other functionality like slicing one needs
+ *        to obtain a view first. This way we can use it on both host and device.
+ */
+template
+class Tensor {
+ public:
+  using ShapeT = size_t[kDim];
+  using StrideT = ShapeT;
+
+ private:
+  HostDeviceVector data_;
+  ShapeT shape_{0};
+
+ public:
+  Tensor() = default;
+
+  /**
+   * \brief Create a tensor with shape and device ordinal. The storage is
+   *        initialized automatically.
+   *
+   * See \ref TensorView for parameters of this constructor.
+   */
+  template
+  explicit Tensor(I const (&shape)[D], int32_t device) {
+    // No device unroll as this is a host only function.
+    std::copy(shape, shape + D, shape_);
+    for (auto i = D; i < kDim; ++i) {
+      shape_[i] = 1;
+    }
+    auto size = detail::CalcSize(shape_);
+    if (device >= 0) {
+      data_.SetDevice(device);
+    }
+    data_.Resize(size);
+    if (device >= 0) {
+      data_.DevicePointer();  // Pull to device
+    }
+  }
+  /**
+   * Initialize from 2 host iterators.
+ */ + template + explicit Tensor(It begin, It end, I const (&shape)[D], int32_t device) { + // shape + static_assert(D <= kDim, "Invalid shape."); + std::copy(shape, shape + D, shape_); + for (auto i = D; i < kDim; ++i) { + shape_[i] = 1; + } + auto &h_vec = data_.HostVector(); + h_vec.insert(h_vec.begin(), begin, end); + if (device >= 0) { + data_.SetDevice(device); + data_.DevicePointer(); // Pull to device; + } + CHECK_EQ(data_.Size(), detail::CalcSize(shape_)); + } + /** + * \brief Get a \ref TensorView for this tensor. + */ + TensorView View(int32_t device) { + if (device >= 0) { + data_.SetDevice(device); + auto span = data_.DeviceSpan(); + return {span, shape_, device}; + } else { + auto span = data_.HostSpan(); + return {span, shape_, device}; + } + } + TensorView View(int32_t device) const { + if (device >= 0) { + data_.SetDevice(device); + auto span = data_.ConstDeviceSpan(); + return {span, shape_, device}; + } else { + auto span = data_.ConstHostSpan(); + return {span, shape_, device}; + } + } + + size_t Size() const { return data_.Size(); } + auto Shape() const { return common::Span{shape_}; } + auto Shape(size_t i) const { return shape_[i]; } + + HostDeviceVector *Data() { return &data_; } + HostDeviceVector const *Data() const { return &data_; } + + /** + * \brief Visitor function for modification that changes shape and data. + * + * \tparam Fn function that takes a pointer to `HostDeviceVector` and a static sized + * span as parameters. + */ + template + void ModifyInplace(Fn &&fn) { + fn(this->Data(), common::Span{this->shape_}); + CHECK_EQ(this->Data()->Size(), detail::CalcSize(this->shape_)) + << "Inconsistent size after modification."; + } + + /** + * \brief Reshape the tensor. + * + * If the total size is changed, then data in this tensor is no longer valid. + */ + template + void Reshape(S &&...s) { + static_assert(sizeof...(S) <= kDim, "Invalid shape."); + detail::ReshapeImpl<0>(shape_, std::forward(s)...); + auto constexpr kEnd = sizeof...(S); + static_assert(kEnd <= kDim, "Invalid shape."); + std::fill(shape_ + kEnd, shape_ + kDim, 1); + auto n = detail::CalcSize(shape_); + data_.Resize(n); + } + + /** + * \brief Reshape the tensor. + * + * If the total size is changed, then data in this tensor is no longer valid. + */ + template + void Reshape(size_t (&shape)[D]) { + static_assert(D <= kDim, "Invalid shape."); + std::copy(shape, shape + D, this->shape_); + std::fill(shape_ + D, shape_ + kDim, 1); + auto n = detail::CalcSize(shape_); + data_.Resize(n); + } + + /** + * \brief Set device ordinal for this tensor. + */ + void SetDevice(int32_t device) { data_.SetDevice(device); } +}; + +// Only first axis is supported for now. 
+template +void Stack(Tensor *l, Tensor const &r) { + if (r.Data()->DeviceIdx() >= 0) { + l->Data()->SetDevice(r.Data()->DeviceIdx()); + } + l->ModifyInplace([&](HostDeviceVector *data, common::Span shape) { + for (size_t i = 1; i < D; ++i) { + if (shape[i] == 0) { + shape[i] = r.Shape(i); + } else { + CHECK_EQ(shape[i], r.Shape(i)); + } + } + data->Extend(*r.Data()); + shape[0] = l->Shape(0) + r.Shape(0); + }); +} } // namespace linalg } // namespace xgboost #endif // XGBOOST_LINALG_H_ diff --git a/include/xgboost/predictor.h b/include/xgboost/predictor.h index dd87c601f978..859669a47ada 100644 --- a/include/xgboost/predictor.h +++ b/include/xgboost/predictor.h @@ -128,7 +128,7 @@ class Predictor { */ virtual void InitOutPredictions(const MetaInfo &info, HostDeviceVector *out_predt, - const gbm::GBTreeModel &model) const = 0; + const gbm::GBTreeModel &model) const; /** * \brief Generate batch predictions for a given feature matrix. May use diff --git a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu index de7a1fc41495..4ecf8b0f1da1 100644 --- a/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu +++ b/jvm-packages/xgboost4j-gpu/src/native/xgboost4j-gpu.cu @@ -35,13 +35,12 @@ template T CheckJvmCall(T const &v, JNIEnv *jenv) { } template -void CopyColumnMask(xgboost::ArrayInterface const &interface, +void CopyColumnMask(xgboost::ArrayInterface<1> const &interface, std::vector const &columns, cudaMemcpyKind kind, size_t c, VCont *p_mask, Json *p_out, cudaStream_t stream) { auto &mask = *p_mask; auto &out = *p_out; - auto size = sizeof(typename VCont::value_type) * interface.num_rows * - interface.num_cols; + auto size = sizeof(typename VCont::value_type) * interface.n; mask.resize(size); CHECK(RawPtr(mask)); CHECK(size); @@ -67,11 +66,11 @@ void CopyColumnMask(xgboost::ArrayInterface const &interface, LOG(FATAL) << "Invalid shape of mask"; } out["mask"]["typestr"] = String(" -void CopyInterface(std::vector &interface_arr, +void CopyInterface(std::vector> &interface_arr, std::vector const &columns, cudaMemcpyKind kind, std::vector *p_data, std::vector *p_mask, std::vector *p_out, cudaStream_t stream) { @@ -81,7 +80,7 @@ void CopyInterface(std::vector &interface_arr, for (size_t c = 0; c < interface_arr.size(); ++c) { auto &interface = interface_arr.at(c); size_t element_size = interface.ElementSize(); - size_t size = element_size * interface.num_rows * interface.num_cols; + size_t size = element_size * interface.n; auto &data = (*p_data)[c]; auto &mask = (*p_mask)[c]; @@ -95,14 +94,13 @@ void CopyInterface(std::vector &interface_arr, Json{Boolean{false}}}; out["data"] = Array(std::move(j_data)); - out["shape"] = Array(std::vector{Json(Integer(interface.num_rows)), - Json(Integer(interface.num_cols))}); + out["shape"] = Array(std::vector{Json(Integer(interface.Shape(0)))}); if (interface.valid.Data()) { CopyColumnMask(interface, columns, kind, c, &mask, &out, stream); } out["typestr"] = String(" *out, cudaStream_t auto &j_interface = *p_interface; CHECK_EQ(get(j_interface).size(), 1); auto object = get(get(j_interface)[0]); - ArrayInterface interface(object); - out->resize(interface.num_rows); + ArrayInterface<1> interface(object); + out->resize(interface.Shape(0)); size_t element_size = interface.ElementSize(); - size_t size = element_size * interface.num_rows; + size_t size = element_size * interface.n; dh::safe_cuda(cudaMemcpyAsync(RawPtr(*out), interface.data, size, cudaMemcpyDeviceToDevice, stream)); 
j_interface[0]["data"][0] = reinterpret_cast(RawPtr(*out)); @@ -285,11 +283,11 @@ class DataIteratorProxy { Json features = json_interface["features_str"]; auto json_columns = get(features); - std::vector interfaces; + std::vector> interfaces; // Stage the data for (auto &json_col : json_columns) { - auto column = ArrayInterface(get(json_col)); + auto column = ArrayInterface<1>(get(json_col)); interfaces.emplace_back(column); } Json::Dump(features, &interface_str); @@ -342,9 +340,9 @@ class DataIteratorProxy { // Data auto const &json_interface = host_columns_.at(it_)->interfaces; - std::vector in; + std::vector> in; for (auto interface : json_interface) { - auto column = ArrayInterface(get(interface)); + auto column = ArrayInterface<1>(get(interface)); in.emplace_back(column); } std::vector out; diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 285b2e840f59..c739f9267af5 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -5,7 +5,7 @@ import json import warnings import os -from typing import Any, Tuple, Callable, Optional, List +from typing import Any, Tuple, Callable, Optional, List, Union import numpy as np @@ -138,14 +138,14 @@ def _is_numpy_array(data): return isinstance(data, (np.ndarray, np.matrix)) -def _ensure_np_dtype(data, dtype): +def _ensure_np_dtype(data, dtype) -> Tuple[np.ndarray, np.dtype]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: data = data.astype(np.float32, copy=False) dtype = np.float32 return data, dtype -def _maybe_np_slice(data, dtype): +def _maybe_np_slice(data: np.ndarray, dtype) -> np.ndarray: '''Handle numpy slice. This can be removed if we use __array_interface__. ''' try: @@ -852,23 +852,17 @@ def _validate_meta_shape(data: Any, name: str) -> None: def _meta_from_numpy( - data: np.ndarray, field: str, dtype, handle: ctypes.c_void_p + data: np.ndarray, + field: str, + dtype: Optional[Union[np.dtype, str]], + handle: ctypes.c_void_p, ) -> None: - data = _maybe_np_slice(data, dtype) + data, dtype = _ensure_np_dtype(data, dtype) interface = data.__array_interface__ - assert interface.get('mask', None) is None, 'Masked array is not supported' - size = data.size - - c_type = _to_data_type(str(data.dtype), field) - ptr = interface['data'][0] - ptr = ctypes.c_void_p(ptr) - _check_call(_LIB.XGDMatrixSetDenseInfo( - handle, - c_str(field), - ptr, - c_bst_ulong(size), - c_type - )) + if interface.get("mask", None) is not None: + raise ValueError("Masked array is not supported.") + interface_str = _array_interface(data) + _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str)) def _meta_from_list(data, field, dtype, handle): @@ -911,7 +905,9 @@ def _meta_from_dt(data, field: str, dtype, handle: ctypes.c_void_p): _meta_from_numpy(data, field, dtype, handle) -def dispatch_meta_backend(matrix: DMatrix, data, name: str, dtype: str = None): +def dispatch_meta_backend( + matrix: DMatrix, data, name: str, dtype: Optional[Union[str, np.dtype]] = None +): '''Dispatch for meta info.''' handle = matrix.handle assert handle is not None diff --git a/src/common/common.cu b/src/common/common.cu index d30fbc0aeb88..4636a4cdcb7c 100644 --- a/src/common/common.cu +++ b/src/common/common.cu @@ -12,7 +12,8 @@ int AllVisibleGPUs() { // When compiled with CUDA but running on CPU only device, // cudaGetDeviceCount will fail. 
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch(const dmlc::Error &except) { + } catch (const dmlc::Error &) { + cudaGetLastError(); // reset error. return 0; } return n_visgpus; diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index cb02b22f0ca3..3a4a59db7881 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -170,6 +170,7 @@ void HostDeviceVector::SetDevice(int) const {} // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t template class HostDeviceVector; diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 8287cb24a1bd..456c60a67071 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -398,6 +398,7 @@ void HostDeviceVector::Resize(size_t new_size, T v) { // explicit instantiations are required, as HostDeviceVector isn't header-only template class HostDeviceVector; +template class HostDeviceVector; template class HostDeviceVector; template class HostDeviceVector; // bst_node_t template class HostDeviceVector; diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh new file mode 100644 index 000000000000..dfab58729b56 --- /dev/null +++ b/src/common/linalg_op.cuh @@ -0,0 +1,25 @@ +/*! + * Copyright 2021 by XGBoost Contributors + */ +#ifndef XGBOOST_COMMON_LINALG_OP_CUH_ +#define XGBOOST_COMMON_LINALG_OP_CUH_ +#include "device_helpers.cuh" +#include "xgboost/linalg.h" + +namespace xgboost { +namespace linalg { +template +void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { + if (t.Contiguous()) { + auto ptr = t.Values().data(); + dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); }); + } else { + dh::LaunchN(t.Size(), s, [=] __device__(size_t i) mutable { + T& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); + v = fn(i, v); + }); + } +} +} // namespace linalg +} // namespace xgboost +#endif // XGBOOST_COMMON_LINALG_OP_CUH_ diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h new file mode 100644 index 000000000000..a74b119e7947 --- /dev/null +++ b/src/common/linalg_op.h @@ -0,0 +1,25 @@ +/*! 
+ * Copyright 2021 by XGBoost Contributors + */ +#ifndef XGBOOST_COMMON_LINALG_OP_H_ +#define XGBOOST_COMMON_LINALG_OP_H_ +#include "threading_utils.h" +#include "xgboost/linalg.h" + +namespace xgboost { +namespace linalg { +template +void ElementWiseKernelHost(linalg::TensorView t, int32_t n_threads, Fn&& fn) { + if (t.Contiguous()) { + auto ptr = t.Values().data(); + common::ParallelFor(t.Size(), n_threads, [&](size_t i) { ptr[i] = fn(i, ptr[i]); }); + } else { + common::ParallelFor(t.Size(), n_threads, [&](size_t i) { + auto& v = detail::Apply(t, linalg::UnravelIndex(i, t.Shape())); + v = fn(i, v); + }); + } +} +} // namespace linalg +} // namespace xgboost +#endif // XGBOOST_COMMON_LINALG_OP_H_ diff --git a/src/data/adapter.h b/src/data/adapter.h index 27da8c6e3b36..c4253b7e7a0e 100644 --- a/src/data/adapter.h +++ b/src/data/adapter.h @@ -254,20 +254,20 @@ class ArrayAdapterBatch : public detail::NoMetaInfo { static constexpr bool kIsRowMajor = true; private: - ArrayInterface array_interface_; + ArrayInterface<2> array_interface_; class Line { - ArrayInterface array_interface_; + ArrayInterface<2> array_interface_; size_t ridx_; public: - Line(ArrayInterface array_interface, size_t ridx) + Line(ArrayInterface<2> array_interface, size_t ridx) : array_interface_{std::move(array_interface)}, ridx_{ridx} {} - size_t Size() const { return array_interface_.num_cols; } + size_t Size() const { return array_interface_.Shape(1); } COOTuple GetElement(size_t idx) const { - return {ridx_, idx, array_interface_.GetElement(ridx_, idx)}; + return {ridx_, idx, array_interface_(ridx_, idx)}; } }; @@ -277,11 +277,11 @@ class ArrayAdapterBatch : public detail::NoMetaInfo { return Line{array_interface_, idx}; } - size_t NumRows() const { return array_interface_.num_rows; } - size_t NumCols() const { return array_interface_.num_cols; } + size_t NumRows() const { return array_interface_.Shape(0); } + size_t NumCols() const { return array_interface_.Shape(1); } size_t Size() const { return this->NumRows(); } - explicit ArrayAdapterBatch(ArrayInterface array_interface) + explicit ArrayAdapterBatch(ArrayInterface<2> array_interface) : array_interface_{std::move(array_interface)} {} }; @@ -294,43 +294,42 @@ class ArrayAdapter : public detail::SingleBatchDataIter { public: explicit ArrayAdapter(StringView array_interface) { auto j = Json::Load(array_interface); - array_interface_ = ArrayInterface(get(j)); + array_interface_ = ArrayInterface<2>(get(j)); batch_ = ArrayAdapterBatch{array_interface_}; } ArrayAdapterBatch const& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.num_rows; } - size_t NumColumns() const { return array_interface_.num_cols; } + size_t NumRows() const { return array_interface_.Shape(0); } + size_t NumColumns() const { return array_interface_.Shape(1); } private: ArrayAdapterBatch batch_; - ArrayInterface array_interface_; + ArrayInterface<2> array_interface_; }; class CSRArrayAdapterBatch : public detail::NoMetaInfo { - ArrayInterface indptr_; - ArrayInterface indices_; - ArrayInterface values_; + ArrayInterface<1> indptr_; + ArrayInterface<1> indices_; + ArrayInterface<1> values_; bst_feature_t n_features_; class Line { - ArrayInterface indices_; - ArrayInterface values_; + ArrayInterface<1> indices_; + ArrayInterface<1> values_; size_t ridx_; size_t offset_; public: - Line(ArrayInterface indices, ArrayInterface values, size_t ridx, + Line(ArrayInterface<1> indices, ArrayInterface<1> values, size_t ridx, size_t offset) : 
indices_{std::move(indices)}, values_{std::move(values)}, ridx_{ridx}, offset_{offset} {} COOTuple GetElement(size_t idx) const { - return {ridx_, indices_.GetElement(offset_ + idx, 0), - values_.GetElement(offset_ + idx, 0)}; + return {ridx_, indices_.operator()(offset_ + idx), values_(offset_ + idx)}; } size_t Size() const { - return values_.num_rows * values_.num_cols; + return values_.Shape(0); } }; @@ -339,17 +338,16 @@ class CSRArrayAdapterBatch : public detail::NoMetaInfo { public: CSRArrayAdapterBatch() = default; - CSRArrayAdapterBatch(ArrayInterface indptr, ArrayInterface indices, - ArrayInterface values, bst_feature_t n_features) - : indptr_{std::move(indptr)}, indices_{std::move(indices)}, - values_{std::move(values)}, n_features_{n_features} { - indptr_.AsColumnVector(); - values_.AsColumnVector(); - indices_.AsColumnVector(); + CSRArrayAdapterBatch(ArrayInterface<1> indptr, ArrayInterface<1> indices, + ArrayInterface<1> values, bst_feature_t n_features) + : indptr_{std::move(indptr)}, + indices_{std::move(indices)}, + values_{std::move(values)}, + n_features_{n_features} { } size_t NumRows() const { - size_t size = indptr_.num_rows * indptr_.num_cols; + size_t size = indptr_.Shape(0); size = size == 0 ? 0 : size - 1; return size; } @@ -357,19 +355,19 @@ class CSRArrayAdapterBatch : public detail::NoMetaInfo { size_t Size() const { return this->NumRows(); } Line const GetLine(size_t idx) const { - auto begin_offset = indptr_.GetElement(idx, 0); - auto end_offset = indptr_.GetElement(idx + 1, 0); + auto begin_no_stride = indptr_.operator()(idx); + auto end_no_stride = indptr_.operator()(idx + 1); auto indices = indices_; auto values = values_; + // Slice indices and values, stride remains unchanged since this is slicing by + // specific index. + auto offset = indices.strides[0] * begin_no_stride; - values.num_cols = end_offset - begin_offset; - values.num_rows = 1; + indices.shape[0] = end_no_stride - begin_no_stride; + values.shape[0] = end_no_stride - begin_no_stride; - indices.num_cols = values.num_cols; - indices.num_rows = values.num_rows; - - return Line{indices, values, idx, begin_offset}; + return Line{indices, values, idx, offset}; } }; @@ -391,7 +389,7 @@ class CSRArrayAdapter : public detail::SingleBatchDataIter return batch_; } size_t NumRows() const { - size_t size = indptr_.num_cols * indptr_.num_rows; + size_t size = indptr_.Shape(0); size = size == 0 ? 0 : size - 1; return size; } @@ -399,9 +397,9 @@ class CSRArrayAdapter : public detail::SingleBatchDataIter private: CSRArrayAdapterBatch batch_; - ArrayInterface indptr_; - ArrayInterface indices_; - ArrayInterface values_; + ArrayInterface<1> indptr_; + ArrayInterface<1> indices_; + ArrayInterface<1> values_; size_t num_cols_; }; diff --git a/src/data/array_interface.cu b/src/data/array_interface.cu index def4de195523..08dbfaefb647 100644 --- a/src/data/array_interface.cu +++ b/src/data/array_interface.cu @@ -7,15 +7,50 @@ namespace xgboost { void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) { switch (stream) { - case 0: - LOG(FATAL) << "Invalid stream ID in array interface: " << stream; - case 1: - // default legacy stream - break; - case 2: - // default per-thread stream - default: - dh::safe_cuda(cudaStreamSynchronize(reinterpret_cast(stream))); + case 0: + /** + * disallowed by the `__cuda_array_interface__`. Quote: + * + * This is disallowed as it would be ambiguous between None and the default + * stream, and also between the legacy and per-thread default streams. 
Any use + * case where 0 might be given should either use None, 1, or 2 instead for + * clarity. + */ + LOG(FATAL) << "Invalid stream ID in array interface: " << stream; + case 1: + // default legacy stream + break; + case 2: + // default per-thread stream + default: + dh::safe_cuda(cudaStreamSynchronize(reinterpret_cast(stream))); + } +} + +bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) { + if (!ptr) { + return false; + } + cudaPointerAttributes attr; + auto err = cudaPointerGetAttributes(&attr, ptr); + // reset error + CHECK_EQ(err, cudaGetLastError()); + if (err == cudaErrorInvalidValue) { + // CUDA < 11 + return false; + } else if (err == cudaSuccess) { + // CUDA >= 11 + switch (attr.type) { + case cudaMemoryTypeUnregistered: + case cudaMemoryTypeHost: + return false; + default: + return true; + } + return true; + } else { + // other errors, no `cudaErrorNoDevice`, `cudaErrorInsufficientDriver` etc. + return false; } } } // namespace xgboost diff --git a/src/data/array_interface.h b/src/data/array_interface.h index 6524f4512407..354a1ac49850 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -13,24 +13,23 @@ #include #include +#include "../common/bitfield.h" +#include "../common/common.h" #include "xgboost/base.h" #include "xgboost/data.h" #include "xgboost/json.h" +#include "xgboost/linalg.h" #include "xgboost/logging.h" #include "xgboost/span.h" -#include "../common/bitfield.h" -#include "../common/common.h" namespace xgboost { // Common errors in parsing columnar format. struct ArrayInterfaceErrors { - static char const* Contigious() { - return "Memory should be contigious."; - } - static char const* TypestrFormat() { + static char const *Contiguous() { return "Memory should be contiguous."; } + static char const *TypestrFormat() { return "`typestr' should be of format ."; } - static char const* Dimension(int32_t d) { + static char const *Dimension(int32_t d) { static std::string str; str.clear(); str += "Only "; @@ -38,11 +37,11 @@ struct ArrayInterfaceErrors { str += " dimensional array is valid."; return str.c_str(); } - static char const* Version() { - return "Only version <= 3 of " - "`__cuda_array_interface__/__array_interface__' are supported."; + static char const *Version() { + return "Only version <= 3 of `__cuda_array_interface__` and `__array_interface__' are " + "supported."; } - static char const* OfType(std::string const& type) { + static char const *OfType(std::string const &type) { static std::string str; str.clear(); str += " should be of "; @@ -96,38 +95,25 @@ struct ArrayInterfaceErrors { // object and turn it into an array (for cupy and numba). class ArrayInterfaceHandler { public: - template - static constexpr char TypeChar() { - return - (std::is_floating_point::value ? 'f' : - (std::is_integral::value ? - (std::is_signed::value ? 
'i' : 'u') : '\0')); - } + enum Type : std::int8_t { kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 }; template - static PtrType GetPtrFromArrayData(std::map const& obj) { + static PtrType GetPtrFromArrayData(std::map const &obj) { if (obj.find("data") == obj.cend()) { LOG(FATAL) << "Empty data passed in."; } - auto p_data = reinterpret_cast(static_cast( - get( - get( - obj.at("data")) - .at(0)))); + auto p_data = reinterpret_cast( + static_cast(get(get(obj.at("data")).at(0)))); return p_data; } - static void Validate(std::map const& array) { + static void Validate(std::map const &array) { auto version_it = array.find("version"); if (version_it == array.cend()) { LOG(FATAL) << "Missing `version' field for array interface"; } - auto stream_it = array.find("stream"); - if (stream_it != array.cend() && !IsA(stream_it->second)) { - // is cuda, check the version. - if (get(version_it->second) > 3) { - LOG(FATAL) << ArrayInterfaceErrors::Version(); - } + if (get(version_it->second) > 3) { + LOG(FATAL) << ArrayInterfaceErrors::Version(); } if (array.find("typestr") == array.cend()) { @@ -149,12 +135,12 @@ class ArrayInterfaceHandler { // Mask object is also an array interface, but with different requirements. static size_t ExtractMask(std::map const &column, common::Span *p_out) { - auto& s_mask = *p_out; + auto &s_mask = *p_out; if (column.find("mask") != column.cend()) { - auto const& j_mask = get(column.at("mask")); + auto const &j_mask = get(column.at("mask")); Validate(j_mask); - auto p_mask = GetPtrFromArrayData(j_mask); + auto p_mask = GetPtrFromArrayData(j_mask); auto j_shape = get(j_mask.at("shape")); CHECK_EQ(j_shape.size(), 1) << ArrayInterfaceErrors::Dimension(1); @@ -186,8 +172,8 @@ class ArrayInterfaceHandler { if (j_mask.find("strides") != j_mask.cend()) { auto strides = get(column.at("strides")); - CHECK_EQ(strides.size(), 1) << ArrayInterfaceErrors::Dimension(1); - CHECK_EQ(get(strides.at(0)), type_length) << ArrayInterfaceErrors::Contigious(); + CHECK_EQ(strides.size(), 1) << ArrayInterfaceErrors::Dimension(1); + CHECK_EQ(get(strides.at(0)), type_length) << ArrayInterfaceErrors::Contiguous(); } s_mask = {p_mask, span_size}; @@ -195,96 +181,225 @@ class ArrayInterfaceHandler { } return 0; } - - static std::pair ExtractShape( - std::map const& column) { - auto j_shape = get(column.at("shape")); - auto typestr = get(column.at("typestr")); - if (j_shape.size() == 1) { - return {static_cast(get(j_shape.at(0))), 1}; - } else { - CHECK_EQ(j_shape.size(), 2) << "Only 1-D and 2-D arrays are supported."; - return {static_cast(get(j_shape.at(0))), - static_cast(get(j_shape.at(1)))}; + /** + * \brief Handle vector inputs. For higher dimension, we require strictly correct shape. + */ + template + static void HandleRowVector(std::vector const &shape, std::vector *p_out) { + auto &out = *p_out; + if (shape.size() == 2 && D == 1) { + auto m = shape[0]; + auto n = shape[1]; + CHECK(m == 1 || n == 1); + if (m == 1) { + // keep the number of columns + out[0] = out[1]; + out.resize(1); + } else if (n == 1) { + // keep the number of rows. + out.resize(1); + } + // when both m and n are 1, above logic keeps the column. + // when neither m nor n is 1, caller should throw an error about Dimension. 
} } - static void ExtractStride(std::map const &column, - size_t *stride_r, size_t *stride_c, size_t rows, - size_t cols, size_t itemsize) { - auto strides_it = column.find("strides"); - if (strides_it == column.cend() || IsA(strides_it->second)) { - // default strides - *stride_r = cols; - *stride_c = 1; - } else { - // strides specified by the array interface - auto const &j_strides = get(strides_it->second); - CHECK_LE(j_strides.size(), 2) << ArrayInterfaceErrors::Dimension(2); - *stride_r = get(j_strides[0]) / itemsize; - size_t n = 1; - if (j_strides.size() == 2) { - n = get(j_strides[1]) / itemsize; - } - *stride_c = n; + template + static void ExtractShape(std::map const &array, size_t (&out_shape)[D]) { + auto const &j_shape = get(array.at("shape")); + std::vector shape_arr(j_shape.size(), 0); + std::transform(j_shape.cbegin(), j_shape.cend(), shape_arr.begin(), + [](Json in) { return get(in); }); + // handle column vector vs. row vector + HandleRowVector(shape_arr, &shape_arr); + // Copy shape. + size_t i; + for (i = 0; i < shape_arr.size(); ++i) { + CHECK_LT(i, D) << ArrayInterfaceErrors::Dimension(D); + out_shape[i] = shape_arr[i]; } + // Fill the remaining dimensions + std::fill(out_shape + i, out_shape + D, 1); + } - auto valid = rows * (*stride_r) + cols * (*stride_c) >= (rows * cols); - CHECK(valid) << "Invalid strides in array." - << " strides: (" << (*stride_r) << "," << (*stride_c) - << "), shape: (" << rows << ", " << cols << ")"; + template + static bool ExtractStride(std::map const &array, size_t itemsize, + size_t (&shape)[D], size_t (&stride)[D]) { + auto strides_it = array.find("strides"); + // No stride is provided + if (strides_it == array.cend() || IsA(strides_it->second)) { + // No stride is provided, we can calculate it from shape. + linalg::detail::CalcStride(shape, stride); + // Quote: + // + // strides: Either None to indicate a C-style contiguous array or a Tuple of + // strides which provides the number of bytes + return true; + } + // Get shape, we need to make changes to handle row vector, so some duplicated code + // from `ExtractShape` for copying out the shape. + auto const &j_shape = get(array.at("shape")); + std::vector shape_arr(j_shape.size(), 0); + std::transform(j_shape.cbegin(), j_shape.cend(), shape_arr.begin(), + [](Json in) { return get(in); }); + // Get stride + auto const &j_strides = get(strides_it->second); + CHECK_EQ(j_strides.size(), j_shape.size()) << "stride and shape don't match."; + std::vector stride_arr(j_strides.size(), 0); + std::transform(j_strides.cbegin(), j_strides.cend(), stride_arr.begin(), + [](Json in) { return get(in); }); + + // Handle column vector vs. row vector + HandleRowVector(shape_arr, &stride_arr); + size_t i; + for (i = 0; i < stride_arr.size(); ++i) { + // If one of the dim has shape 0 then total size is 0, stride is meaningless, but we + // set it to 0 here just to be consistent + CHECK_LT(i, D) << ArrayInterfaceErrors::Dimension(D); + // We use number of items instead of number of bytes + stride[i] = stride_arr[i] / itemsize; + } + std::fill(stride + i, stride + D, 1); + // If the stride can be calculated from shape then it's contiguous. 
+    size_t stride_tmp[D];
+    linalg::detail::CalcStride(shape, stride_tmp);
+    return std::equal(stride_tmp, stride_tmp + D, stride);
+  }

-  static void* ExtractData(std::map const &column,
-                           std::pair shape) {
-    Validate(column);
-    void* p_data = ArrayInterfaceHandler::GetPtrFromArrayData(column);
+  static void *ExtractData(std::map const &array, size_t size) {
+    Validate(array);
+    void *p_data = ArrayInterfaceHandler::GetPtrFromArrayData(array);
    if (!p_data) {
-      CHECK_EQ(shape.first * shape.second, 0) << "Empty data with non-zero shape.";
+      CHECK_EQ(size, 0) << "Empty data with non-zero shape.";
    }
    return p_data;
  }
-
+  /**
+   * \brief Whether the ptr is allocated by CUDA.
+   */
+  static bool IsCudaPtr(void const *ptr);
+  /**
+   * \brief Sync the CUDA stream.
+   */
  static void SyncCudaStream(int64_t stream);
 };

+/**
+ * Dispatch compile time type to runtime type.
+ */
+template <typename T, typename E = void>
+struct ToDType;
+// float
+template <>
+struct ToDType<float> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF4;
+};
+template <>
+struct ToDType<double> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF8;
+};
+template <typename T>
+struct ToDType<T, std::enable_if_t<std::is_same<T, long double>::value &&
+                                   sizeof(long double) == 16>> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kF16;
+};
+// uint
+template <>
+struct ToDType<uint8_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kU1;
+};
+template <>
+struct ToDType<uint16_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kU2;
+};
+template <>
+struct ToDType<uint32_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kU4;
+};
+template <>
+struct ToDType<uint64_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kU8;
+};
+// int
+template <>
+struct ToDType<int8_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI1;
+};
+template <>
+struct ToDType<int16_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI2;
+};
+template <>
+struct ToDType<int32_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI4;
+};
+template <>
+struct ToDType<int64_t> {
+  static constexpr ArrayInterfaceHandler::Type kType = ArrayInterfaceHandler::kI8;
+};
+
 #if !defined(XGBOOST_USE_CUDA)
-inline void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) {
-  common::AssertGPUSupport();
-}
+inline void ArrayInterfaceHandler::SyncCudaStream(int64_t stream) { common::AssertGPUSupport(); }
+inline bool ArrayInterfaceHandler::IsCudaPtr(void const* ptr) { return false; }
 #endif  // !defined(XGBOOST_USE_CUDA)

 // A view over __array_interface__
+/**
+ * \brief A type-erased view over the __array_interface__ protocol defined by numpy.
+ *
+ * \tparam D The maximum number of dimensions.
+ *
+ *   User input arrays must have dim <= D for all non-trivial dimensions. During
+ *   construction, the ctor can automatically remove those trivial dimensions.
+ *
+ * \tparam allow_mask Whether a masked array is accepted.
+ *
+ *   Currently this is only supported for 1-dim vectors, which are used by cuDF
+ *   columns (Apache Arrow format). For general masked arrays, as of the time of
+ *   writing only numpy has proper support, even though a mask is part of the
+ *   __cuda_array_interface__ protocol defined by numba.
+ */
+template <int32_t D, bool allow_mask = true>
 class ArrayInterface {
-  void Initialize(std::map const &array,
-                  bool allow_mask = true) {
+  static_assert(D > 0, "Invalid dimension for array interface.");
+
+  /**
+   * \brief Initialize the object by extracting shape, stride and type.
+ + /** + * \brief Initialize the object, by extracting shape, stride and type. + * + * The function also performs some basic validation for the input array. Lastly it + * will also remove trivial dimensions, like converting a matrix with shape + * (n_samples, 1) to a vector of size n_samples. For inputs like weights, this should + * be a 1-dimension column vector even though the user might provide a matrix. + */ + void Initialize(std::map<std::string, Json> const &array) { ArrayInterfaceHandler::Validate(array); + auto typestr = get<String const>(array.at("typestr")); this->AssignType(StringView{typestr}); + ArrayInterfaceHandler::ExtractShape(array, shape); + size_t itemsize = typestr[2] - '0'; + is_contiguous = ArrayInterfaceHandler::ExtractStride(array, itemsize, shape, strides); + n = linalg::detail::CalcSize(shape); - std::tie(num_rows, num_cols) = ArrayInterfaceHandler::ExtractShape(array); - data = ArrayInterfaceHandler::ExtractData( - array, std::make_pair(num_rows, num_cols)); + data = ArrayInterfaceHandler::ExtractData(array, n); if (allow_mask) { + CHECK(D == 1) << "Masked array is not supported."; common::Span<RBitField8::value_type> s_mask; size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask); valid = RBitField8(s_mask); if (s_mask.data()) { - CHECK_EQ(n_bits, num_rows) - << "Shape of bit mask doesn't match data shape. " - << "XGBoost doesn't support internal broadcasting."; + CHECK_EQ(n_bits, n) << "Shape of bit mask doesn't match data shape. " + << "XGBoost doesn't support internal broadcasting."; } } else { - CHECK(array.find("mask") == array.cend()) - << "Masked array is not yet supported."; + CHECK(array.find("mask") == array.cend()) << "Masked array is not yet supported."; } - ArrayInterfaceHandler::ExtractStride(array, &stride_row, &stride_col, - num_rows, num_cols, typestr[2] - '0'); - auto stream_it = array.find("stream"); if (stream_it != array.cend() && !IsA<Null>(stream_it->second)) { int64_t stream = get<Integer const>(stream_it->second); @@ -292,151 +407,149 @@ class ArrayInterface { } } - public: - enum Type : std::int8_t { kF4, kF8, kF16, kI1, kI2, kI4, kI8, kU1, kU2, kU4, kU8 }; - public: ArrayInterface() = default; - explicit ArrayInterface(std::string const &str, bool allow_mask = true) - : ArrayInterface{StringView{str.c_str(), str.size()}, allow_mask} {} - - explicit ArrayInterface(std::map<std::string, Json> const &column, - bool allow_mask = true) { - this->Initialize(column, allow_mask); + explicit ArrayInterface(std::map<std::string, Json> const &array) { + this->Initialize(array); } - explicit ArrayInterface(StringView str, bool allow_mask = true) { - auto jinterface = Json::Load(str); - if (IsA<Object>(jinterface)) { - this->Initialize(get<Object const>(jinterface), allow_mask); + explicit ArrayInterface(Json const &array) { + if (IsA<Object>(array)) { + this->Initialize(get<Object const>(array)); return; } - if (IsA<Array>(jinterface)) { - CHECK_EQ(get<Array const>(jinterface).size(), 1) + if (IsA<Array>(array)) { + CHECK_EQ(get<Array const>(array).size(), 1) << "Column: " << ArrayInterfaceErrors::Dimension(1); - this->Initialize(get<Object const>(get<Array const>(jinterface)[0]), allow_mask); + this->Initialize(get<Object const>(get<Array const>(array)[0])); return; } } - void AsColumnVector() { - CHECK(num_rows == 1 || num_cols == 1) << "Array should be a vector instead of matrix."; - num_rows = std::max(num_rows, static_cast<bst_row_t>(num_cols)); - num_cols = 1; + explicit ArrayInterface(std::string const &str) : ArrayInterface{StringView{str}} {} - stride_row = std::max(stride_row, stride_col); - stride_col = 1; - } + explicit ArrayInterface(StringView str) : ArrayInterface{Json::Load(str)} {} void AssignType(StringView typestr) { - if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && - typestr[3] == '6') { - type = kF16; + using T =
ArrayInterfaceHandler::Type; + if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && typestr[3] == '6') { + type = T::kF16; CHECK(sizeof(long double) == 16) << "128-bit floating point is not supported on current platform."; } else if (typestr[1] == 'f' && typestr[2] == '4') { - type = kF4; + type = T::kF4; } else if (typestr[1] == 'f' && typestr[2] == '8') { - type = kF8; + type = T::kF8; } else if (typestr[1] == 'i' && typestr[2] == '1') { - type = kI1; + type = T::kI1; } else if (typestr[1] == 'i' && typestr[2] == '2') { - type = kI2; + type = T::kI2; } else if (typestr[1] == 'i' && typestr[2] == '4') { - type = kI4; + type = T::kI4; } else if (typestr[1] == 'i' && typestr[2] == '8') { - type = kI8; + type = T::kI8; } else if (typestr[1] == 'u' && typestr[2] == '1') { - type = kU1; + type = T::kU1; } else if (typestr[1] == 'u' && typestr[2] == '2') { - type = kU2; + type = T::kU2; } else if (typestr[1] == 'u' && typestr[2] == '4') { - type = kU4; + type = T::kU4; } else if (typestr[1] == 'u' && typestr[2] == '8') { - type = kU8; + type = T::kU8; } else { LOG(FATAL) << ArrayInterfaceErrors::UnSupportedType(typestr); return; } } + XGBOOST_DEVICE size_t Shape(size_t i) const { return shape[i]; } + XGBOOST_DEVICE size_t Stride(size_t i) const { return strides[i]; } + template - XGBOOST_HOST_DEV_INLINE decltype(auto) DispatchCall(Fn func) const { + XGBOOST_HOST_DEV_INLINE constexpr decltype(auto) DispatchCall(Fn func) const { + using T = ArrayInterfaceHandler::Type; switch (type) { - case kF4: - return func(reinterpret_cast(data)); - case kF8: - return func(reinterpret_cast(data)); + case T::kF4: + return func(reinterpret_cast(data)); + case T::kF8: + return func(reinterpret_cast(data)); #ifdef __CUDA_ARCH__ - case kF16: { - // CUDA device code doesn't support long double. - SPAN_CHECK(false); - return func(reinterpret_cast(data)); - } + case T::kF16: { + // CUDA device code doesn't support long double. 
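AssignType and the itemsize computation in Initialize both rely on the layout of a numpy typestr: byte 0 is the byte order ('<', '>' or '|'), byte 1 the type kind ('f', 'i', 'u'), and the remaining characters the item size in bytes. A small sketch of the same parse (it assumes little-endian or byte-order-agnostic input; the names are local to the example):

#include <cassert>
#include <iostream>
#include <string>

struct TypeDesc {
  char kind;     // 'f' = float, 'i' = signed integer, 'u' = unsigned integer
  int itemsize;  // in bytes
};

// Decode a typestr such as "<f4" (little-endian float32) or "<i8" (int64).
TypeDesc ParseTypestr(std::string const &typestr) {
  assert(typestr.size() >= 3);
  assert(typestr[0] == '<' || typestr[0] == '|');  // big-endian is rejected upstream
  return TypeDesc{typestr[1], std::stoi(typestr.substr(2))};
}

int main() {
  auto t = ParseTypestr("<f4");
  std::cout << t.kind << " " << t.itemsize << "\n";  // f 4
}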
+ SPAN_CHECK(false); + return func(reinterpret_cast<double const *>(data)); + } #else - case kF16: - return func(reinterpret_cast<long double *>(data)); + case T::kF16: + return func(reinterpret_cast<long double const *>(data)); #endif - case kI1: - return func(reinterpret_cast<int8_t *>(data)); - case kI2: - return func(reinterpret_cast<int16_t *>(data)); - case kI4: - return func(reinterpret_cast<int32_t *>(data)); - case kI8: - return func(reinterpret_cast<int64_t *>(data)); - case kU1: - return func(reinterpret_cast<uint8_t *>(data)); - case kU2: - return func(reinterpret_cast<uint16_t *>(data)); - case kU4: - return func(reinterpret_cast<uint32_t *>(data)); - case kU8: - return func(reinterpret_cast<uint64_t *>(data)); + case T::kI1: + return func(reinterpret_cast<int8_t const *>(data)); + case T::kI2: + return func(reinterpret_cast<int16_t const *>(data)); + case T::kI4: + return func(reinterpret_cast<int32_t const *>(data)); + case T::kI8: + return func(reinterpret_cast<int64_t const *>(data)); + case T::kU1: + return func(reinterpret_cast<uint8_t const *>(data)); + case T::kU2: + return func(reinterpret_cast<uint16_t const *>(data)); + case T::kU4: + return func(reinterpret_cast<uint32_t const *>(data)); + case T::kU8: + return func(reinterpret_cast<uint64_t const *>(data)); } SPAN_CHECK(false); return func(reinterpret_cast<uint64_t const *>(data)); } - XGBOOST_DEVICE size_t ElementSize() { - return this->DispatchCall([](auto* p_values) { - return sizeof(std::remove_pointer_t<decltype(p_values)>); - }); + XGBOOST_DEVICE size_t constexpr ElementSize() { + return this->DispatchCall( + [](auto *p_values) { return sizeof(std::remove_pointer_t<decltype(p_values)>); }); } - template <typename T = float> - XGBOOST_DEVICE T GetElement(size_t r, size_t c) const { - return this->DispatchCall( - [=](auto *p_values) -> T { return p_values[stride_row * r + stride_col * c]; }); + template <typename T = float, typename... Index> + XGBOOST_DEVICE T operator()(Index &&...index) const { + static_assert(sizeof...(index) <= D, "Invalid index."); + return this->DispatchCall([=](auto const *p_values) -> T { + size_t offset = linalg::detail::Offset<0ul>(strides, 0ul, index...); + return static_cast<T>(p_values[offset]); + }); } + // Used only by columnar format. RBitField8 valid; - bst_row_t num_rows; - bst_feature_t num_cols; - size_t stride_row{0}; - size_t stride_col{0}; - void* data; - Type type; + // Array stride + size_t strides[D]{0}; + // Array shape + size_t shape[D]{0}; + // Type erased pointer referencing the data. + void *data; + // Total number of items + size_t n; + // Whether the memory is C-contiguous. + bool is_contiguous{false}; + // RTTI + ArrayInterfaceHandler::Type type; }; -template <typename T> std::string MakeArrayInterface(T const *data, size_t n) { - Json arr{Object{}}; - arr["data"] = Array(std::vector<Json>{ - Json{Integer{reinterpret_cast<int64_t>(data)}}, Json{Boolean{false}}}); - arr["shape"] = Array{std::vector<Json>{Json{Integer{n}}, Json{Integer{1}}}}; - std::string typestr; - if (DMLC_LITTLE_ENDIAN) { - typestr.push_back('<'); - } else { - typestr.push_back('>'); +/** + * \brief Helper for type casting.
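The variadic operator() above turns an index pack into a flat element offset by folding each index against the matching stride, which is the job of linalg::detail::Offset. An equivalent recursion written out standalone, for illustration only:

#include <cstddef>
#include <iostream>

// Base case: no indices left; trailing dimensions are implicitly zero.
template <std::size_t kDim, std::size_t D>
std::size_t Offset(std::size_t const (&)[D], std::size_t acc) {
  return acc;
}

// Fold one index at a time: acc += stride[kDim] * index.
template <std::size_t kDim, std::size_t D, typename... Rest>
std::size_t Offset(std::size_t const (&stride)[D], std::size_t acc, std::size_t head,
                   Rest... rest) {
  static_assert(kDim < D, "too many indices");
  return Offset<kDim + 1>(stride, acc + stride[kDim] * head, rest...);
}

int main() {
  std::size_t stride[2]{3, 1};                      // row-major 2x3 matrix
  std::cout << Offset<0>(stride, 0, 1, 2) << "\n";  // 1*3 + 2*1 = 5
}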
+ */ +template +struct TypedIndex { + ArrayInterface const &array; + template + XGBOOST_DEVICE T operator()(I &&...ind) { + static_assert(sizeof...(ind) <= D, "Invalid index."); + return array.template operator()(ind...); } - typestr.push_back(ArrayInterfaceHandler::TypeChar()); - typestr += std::to_string(sizeof(T)); - arr["typestr"] = typestr; - arr["version"] = 3; - std::string str; - Json::Dump(arr, &str); - return str; +}; + +template +inline void CheckArrayInterface(StringView key, ArrayInterface const &array) { + CHECK(!array.valid.Data()) << "Meta info " << key << " should be dense, found validity mask"; } } // namespace xgboost #endif // XGBOOST_DATA_ARRAY_INTERFACE_H_ diff --git a/src/data/data.cc b/src/data/data.cc index 91526429e3b8..d9f489103f22 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -1,8 +1,9 @@ /*! - * Copyright 2015-2020 by Contributors + * Copyright 2015-2021 by Contributors * \file data.cc */ #include +#include #include #include "dmlc/io.h" @@ -12,10 +13,13 @@ #include "xgboost/logging.h" #include "xgboost/version_config.h" #include "xgboost/learner.h" +#include "xgboost/string_view.h" + #include "sparse_page_writer.h" #include "simple_dmatrix.h" #include "../common/io.h" +#include "../common/linalg_op.h" #include "../common/math.h" #include "../common/version.h" #include "../common/group_data.h" @@ -24,6 +28,7 @@ #include "../data/iterative_device_dmatrix.h" #include "file_iterator.h" +#include "validation.h" #include "./sparse_page_source.h" #include "./sparse_page_dmatrix.h" @@ -65,10 +70,22 @@ void SaveVectorField(dmlc::Stream* strm, const std::string& name, SaveVectorField(strm, name, type, shape, field.ConstHostVector()); } +template +void SaveTensorField(dmlc::Stream* strm, const std::string& name, xgboost::DataType type, + const xgboost::linalg::Tensor& field) { + strm->Write(name); + strm->Write(static_cast(type)); + strm->Write(false); // is_scalar=False + for (size_t i = 0; i < D; ++i) { + strm->Write(field.Shape(i)); + } + strm->Write(field.Data()->HostVector()); +} + template void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name, xgboost::DataType expected_type, T* field) { - const std::string invalid {"MetaInfo: Invalid format. "}; + const std::string invalid{"MetaInfo: Invalid format for " + expected_name}; std::string name; xgboost::DataType type; bool is_scalar; @@ -90,7 +107,7 @@ void LoadScalarField(dmlc::Stream* strm, const std::string& expected_name, template void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name, xgboost::DataType expected_type, std::vector* field) { - const std::string invalid {"MetaInfo: Invalid format. 
"}; + const std::string invalid{"MetaInfo: Invalid format for " + expected_name}; std::string name; xgboost::DataType type; bool is_scalar; @@ -123,6 +140,33 @@ void LoadVectorField(dmlc::Stream* strm, const std::string& expected_name, LoadVectorField(strm, expected_name, expected_type, &field->HostVector()); } +template +void LoadTensorField(dmlc::Stream* strm, std::string const& expected_name, + xgboost::DataType expected_type, xgboost::linalg::Tensor* p_out) { + const std::string invalid{"MetaInfo: Invalid format for " + expected_name}; + std::string name; + xgboost::DataType type; + bool is_scalar; + CHECK(strm->Read(&name)) << invalid; + CHECK_EQ(name, expected_name) << invalid << " Expected field: " << expected_name + << ", got: " << name; + uint8_t type_val; + CHECK(strm->Read(&type_val)) << invalid; + type = static_cast(type_val); + CHECK(type == expected_type) << invalid + << "Expected field of type: " << static_cast(expected_type) + << ", " + << "got field type: " << static_cast(type); + CHECK(strm->Read(&is_scalar)) << invalid; + CHECK(!is_scalar) << invalid << "Expected field " << expected_name + << " to be a tensor; got a scalar"; + std::array shape; + for (size_t i = 0; i < D; ++i) { + CHECK(strm->Read(&(shape[i]))); + } + auto& field = p_out->Data()->HostVector(); + CHECK(strm->Read(&field)) << invalid; +} } // anonymous namespace namespace xgboost { @@ -135,25 +179,26 @@ void MetaInfo::Clear() { labels_.HostVector().clear(); group_ptr_.clear(); weights_.HostVector().clear(); - base_margin_.HostVector().clear(); + base_margin_ = decltype(base_margin_){}; } /* * Binary serialization format for MetaInfo: * - * | name | type | is_scalar | num_row | num_col | value | - * |--------------------+----------+-----------+---------+---------+-------------------------| - * | num_row | kUInt64 | True | NA | NA | ${num_row_} | - * | num_col | kUInt64 | True | NA | NA | ${num_col_} | - * | num_nonzero | kUInt64 | True | NA | NA | ${num_nonzero_} | - * | labels | kFloat32 | False | ${size} | 1 | ${labels_} | - * | group_ptr | kUInt32 | False | ${size} | 1 | ${group_ptr_} | - * | weights | kFloat32 | False | ${size} | 1 | ${weights_} | - * | base_margin | kFloat32 | False | ${size} | 1 | ${base_margin_} | - * | labels_lower_bound | kFloat32 | False | ${size} | 1 | ${labels_lower_bound_} | - * | labels_upper_bound | kFloat32 | False | ${size} | 1 | ${labels_upper_bound_} | - * | feature_names | kStr | False | ${size} | 1 | ${feature_names} | - * | feature_types | kStr | False | ${size} | 1 | ${feature_types} | + * | name | type | is_scalar | num_row | num_col | dim3 | value | + * |--------------------+----------+-----------+-------------+-------------+-------------+------------------------| + * | num_row | kUInt64 | True | NA | NA | NA | ${num_row_} | + * | num_col | kUInt64 | True | NA | NA | NA | ${num_col_} | + * | num_nonzero | kUInt64 | True | NA | NA | NA | ${num_nonzero_} | + * | labels | kFloat32 | False | ${size} | 1 | NA | ${labels_} | + * | group_ptr | kUInt32 | False | ${size} | 1 | NA | ${group_ptr_} | + * | weights | kFloat32 | False | ${size} | 1 | NA | ${weights_} | + * | base_margin | kFloat32 | False | ${Shape(0)} | ${Shape(1)} | ${Shape(2)} | ${base_margin_} | + * | labels_lower_bound | kFloat32 | False | ${size} | 1 | NA | ${labels_lower_bound_} | + * | labels_upper_bound | kFloat32 | False | ${size} | 1 | NA | ${labels_upper_bound_} | + * | feature_names | kStr | False | ${size} | 1 | NA | ${feature_names} | + * | feature_types | kStr | False | ${size} | 1 | NA | 
${feature_types} | + * | feature_weights | kFloat32 | False | ${size} | 1 | NA | ${feature_weights} | * * Note that the scalar fields (is_scalar=True) will have num_row and num_col missing. * Also notice the difference between the saved name and the name used in `SetInfo': @@ -174,8 +219,7 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const { {group_ptr_.size(), 1}, group_ptr_); ++field_cnt; SaveVectorField(fo, u8"weights", DataType::kFloat32, {weights_.Size(), 1}, weights_); ++field_cnt; - SaveVectorField(fo, u8"base_margin", DataType::kFloat32, - {base_margin_.Size(), 1}, base_margin_); ++field_cnt; + SaveTensorField(fo, u8"base_margin", DataType::kFloat32, base_margin_); ++field_cnt; SaveVectorField(fo, u8"labels_lower_bound", DataType::kFloat32, {labels_lower_bound_.Size(), 1}, labels_lower_bound_); ++field_cnt; SaveVectorField(fo, u8"labels_upper_bound", DataType::kFloat32, @@ -185,6 +229,9 @@ void MetaInfo::SaveBinary(dmlc::Stream *fo) const { {feature_names.size(), 1}, feature_names); ++field_cnt; SaveVectorField(fo, u8"feature_types", DataType::kStr, {feature_type_names.size(), 1}, feature_type_names); ++field_cnt; + SaveVectorField(fo, u8"feature_weights", DataType::kFloat32, {feature_weights.Size(), 1}, + feature_weights); + ++field_cnt; CHECK_EQ(field_cnt, kNumField) << "Wrong number of fields"; }
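Each tensor field in the binary blob is therefore self-describing: name, dtype tag, scalar flag, then one extent per dimension before the payload, matching the table above. A rough illustration of that header layout using plain streams; the real code goes through dmlc::Stream, and the encoding details below (length prefix, raw-byte writes) are assumptions of this sketch, not the library's format:

#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Mimic the field layout from the table for a 3-dim float tensor:
// name | type | is_scalar | shape[0..2] | values.
void SaveTensorFieldSketch(std::ostream &os, std::string const &name, uint8_t type_tag,
                           std::vector<uint64_t> const &shape,
                           std::vector<float> const &values) {
  uint64_t name_len = name.size();
  os.write(reinterpret_cast<char const *>(&name_len), sizeof(name_len));
  os.write(name.data(), name.size());
  os.write(reinterpret_cast<char const *>(&type_tag), sizeof(type_tag));
  bool is_scalar = false;  // tensors always carry their extents
  os.write(reinterpret_cast<char const *>(&is_scalar), sizeof(is_scalar));
  for (auto extent : shape) {
    os.write(reinterpret_cast<char const *>(&extent), sizeof(extent));
  }
  os.write(reinterpret_cast<char const *>(values.data()), values.size() * sizeof(float));
}

int main() {
  std::stringstream ss;
  SaveTensorFieldSketch(ss, "base_margin", /*kFloat32=*/1, {2, 3, 1},
                        std::vector<float>(6, 0.5f));
  std::cout << ss.str().size() << " bytes\n";
}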
" + << "Please process and save your data in current version: " + << Version::String(Version::Self()) << " again."; + CHECK_EQ(major, 1) << msg.str(); + auto minor = std::get<1>(version); + CHECK_GE(minor, 6) << msg.str(); const uint64_t expected_num_field = kNumField; uint64_t num_field { 0 }; @@ -243,12 +294,13 @@ void MetaInfo::LoadBinary(dmlc::Stream *fi) { LoadVectorField(fi, u8"labels", DataType::kFloat32, &labels_); LoadVectorField(fi, u8"group_ptr", DataType::kUInt32, &group_ptr_); LoadVectorField(fi, u8"weights", DataType::kFloat32, &weights_); - LoadVectorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_); + LoadTensorField(fi, u8"base_margin", DataType::kFloat32, &base_margin_); LoadVectorField(fi, u8"labels_lower_bound", DataType::kFloat32, &labels_lower_bound_); LoadVectorField(fi, u8"labels_upper_bound", DataType::kFloat32, &labels_upper_bound_); LoadVectorField(fi, u8"feature_names", DataType::kStr, &feature_names); LoadVectorField(fi, u8"feature_types", DataType::kStr, &feature_type_names); + LoadVectorField(fi, u8"feature_weights", DataType::kFloat32, &feature_weights); LoadFeatureType(feature_type_names, &feature_types.HostVector()); } @@ -291,14 +343,17 @@ MetaInfo MetaInfo::Slice(common::Span ridxs) const { if (this->base_margin_.Size() != this->num_row_) { CHECK_EQ(this->base_margin_.Size() % this->num_row_, 0) << "Incorrect size of base margin vector."; - size_t stride = this->base_margin_.Size() / this->num_row_; - out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs, stride); + auto margin = this->base_margin_.View(this->base_margin_.Data()->DeviceIdx()); + out.base_margin_.Reshape(ridxs.size(), margin.Shape()[1], margin.Shape()[2]); + size_t stride = margin.Stride(0); + out.base_margin_.Data()->HostVector() = + Gather(this->base_margin_.Data()->HostVector(), ridxs, stride); } else { - out.base_margin_.HostVector() = Gather(this->base_margin_.HostVector(), ridxs); + out.base_margin_.Data()->HostVector() = Gather(this->base_margin_.Data()->HostVector(), ridxs); } - out.feature_weigths.Resize(this->feature_weigths.Size()); - out.feature_weigths.Copy(this->feature_weigths); + out.feature_weights.Resize(this->feature_weights.Size()); + out.feature_weights.Copy(this->feature_weights); out.feature_names = this->feature_names; out.feature_types.Resize(this->feature_types.Size()); @@ -337,116 +392,179 @@ inline bool MetaTryLoadFloatInfo(const std::string& fname, return true; } -void ValidateQueryGroup(std::vector const &group_ptr_) { - bool valid_query_group = true; - for (size_t i = 1; i < group_ptr_.size(); ++i) { - valid_query_group = valid_query_group && group_ptr_[i] >= group_ptr_[i - 1]; - if (!valid_query_group) { - break; - } +namespace { +template +void CopyTensorInfoImpl(Json arr_interface, linalg::Tensor* p_out) { + ArrayInterface array{arr_interface}; + if (array.n == 0) { + return; + } + CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value."; + if (array.is_contiguous && array.type == ToDType::kType) { + // Handle contigious + p_out->ModifyInplace([&](HostDeviceVector* data, common::Span shape) { + // set shape + std::copy(array.shape, array.shape + D, shape.data()); + // set data + data->Resize(array.n); + std::memcpy(data->HostPointer(), array.data, array.n * sizeof(T)); + }); + return; } - CHECK(valid_query_group) << "Invalid group structure."; + p_out->Reshape(array.shape); + auto t = p_out->View(GenericParameter::kCpuId); + CHECK(t.Contiguous()); + // FIXME(jiamingy): Remove the 
use of this default thread. + linalg::ElementWiseKernelHost(t, common::OmpGetNumThreads(0), [&](auto i, auto) { + return linalg::detail::Apply(TypedIndex{array}, linalg::UnravelIndex(i, t.Shape())); + }); } +} // namespace + +void MetaInfo::SetInfo(StringView key, StringView interface_str) { + Json j_interface = Json::Load(interface_str); + bool is_cuda{false}; + if (IsA(j_interface)) { + auto const& array = get(j_interface); + CHECK_GE(array.size(), 0) << "Invalid " << key + << ", must have at least 1 column even if it's empty."; + auto const& first = get(array.front()); + auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first); + is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr); + } else { + auto const& first = get(j_interface); + auto ptr = ArrayInterfaceHandler::GetPtrFromArrayData(first); + is_cuda = ArrayInterfaceHandler::IsCudaPtr(ptr); + } -// macro to dispatch according to specified pointer types -#define DISPATCH_CONST_PTR(dtype, old_ptr, cast_ptr, proc) \ - switch (dtype) { \ - case xgboost::DataType::kFloat32: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case xgboost::DataType::kDouble: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case xgboost::DataType::kUInt32: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - case xgboost::DataType::kUInt64: { \ - auto cast_ptr = reinterpret_cast(old_ptr); proc; break; \ - } \ - default: LOG(FATAL) << "Unknown data type" << static_cast(dtype); \ - } \ + if (is_cuda) { + this->SetInfoFromCUDA(key, j_interface); + } else { + this->SetInfoFromHost(key, j_interface); + } +} -void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) { - if (!std::strcmp(key, "label")) { - auto& labels = labels_.HostVector(); - labels.resize(num); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, labels.begin())); - auto valid = std::none_of(labels.cbegin(), labels.cend(), [](auto y) { - return std::isnan(y) || std::isinf(y); - }); - CHECK(valid) << "Label contains NaN, infinity or a value too large."; - } else if (!std::strcmp(key, "weight")) { - auto& weights = weights_.HostVector(); - weights.resize(num); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, weights.begin())); - auto valid = std::none_of(weights.cbegin(), weights.cend(), [](float w) { - return w < 0 || std::isinf(w) || std::isnan(w); - }); - CHECK(valid) << "Weights must be positive values."; - } else if (!std::strcmp(key, "base_margin")) { - auto& base_margin = base_margin_.HostVector(); - base_margin.resize(num); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, base_margin.begin())); - } else if (!std::strcmp(key, "group")) { - group_ptr_.clear(); group_ptr_.resize(num + 1, 0); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, group_ptr_.begin() + 1)); - group_ptr_[0] = 0; - for (size_t i = 1; i < group_ptr_.size(); ++i) { - group_ptr_[i] = group_ptr_[i - 1] + group_ptr_[i]; +void MetaInfo::SetInfoFromHost(StringView key, Json arr) { + // multi-dim float info + if (key == "base_margin") { + CopyTensorInfoImpl<3>(arr, &this->base_margin_); + // FIXME(jiamingy): Remove the deprecated API and let all language bindings aware of + // input shape. This issue is CPU only since CUDA uses array interface from day 1. + // + // Python binding always understand the shape, so this condition should not occur for + // it. 
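The FIXME above concerns legacy callers that hand over base_margin as a flat buffer of length n_samples * n_groups; the reshape that follows has to infer the shape. The heuristic in isolation, as a sketch:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

// Infer (n_samples, n_groups) for a flat margin vector coming from an API that
// cannot communicate a shape; mirrors the fallback in SetInfoFromHost.
std::pair<std::size_t, std::size_t> InferMarginShape(std::size_t size, std::size_t n_samples) {
  assert(n_samples != 0 && size % n_samples == 0 && "Incorrect size for base margin.");
  return {n_samples, size / n_samples};
}

int main() {
  std::vector<float> margin(100 * 3, 0.f);  // e.g. 100 rows, 3 classes, flattened
  auto shape = InferMarginShape(margin.size(), 100);
  std::cout << shape.first << " x " << shape.second << "\n";  // 100 x 3
}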
+ if (this->num_row_ != 0 && this->base_margin_.Shape(0) != this->num_row_) { + // API functions that don't use array interface don't understand shape. + CHECK(this->base_margin_.Size() % this->num_row_ == 0) << "Incorrect size for base margin."; + size_t n_groups = this->base_margin_.Size() / this->num_row_; + this->base_margin_.Reshape(this->num_row_, n_groups); } - ValidateQueryGroup(group_ptr_); - } else if (!std::strcmp(key, "qid")) { - std::vector query_ids(num, 0); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, query_ids.begin())); + return; + } + // uint info + if (key == "group") { + linalg::Tensor t; + CopyTensorInfoImpl(arr, &t); + auto const& h_groups = t.Data()->HostVector(); + group_ptr_.clear(); + group_ptr_.resize(h_groups.size() + 1, 0); + group_ptr_[0] = 0; + std::partial_sum(h_groups.cbegin(), h_groups.cend(), group_ptr_.begin() + 1); + data::ValidateQueryGroup(group_ptr_); + return; + } else if (key == "qid") { + linalg::Tensor t; + CopyTensorInfoImpl(arr, &t); bool non_dec = true; + auto const& query_ids = t.Data()->HostVector(); for (size_t i = 1; i < query_ids.size(); ++i) { - if (query_ids[i] < query_ids[i-1]) { + if (query_ids[i] < query_ids[i - 1]) { non_dec = false; break; } } CHECK(non_dec) << "`qid` must be sorted in non-decreasing order along with data."; - group_ptr_.clear(); group_ptr_.push_back(0); + group_ptr_.clear(); + group_ptr_.push_back(0); for (size_t i = 1; i < query_ids.size(); ++i) { - if (query_ids[i] != query_ids[i-1]) { + if (query_ids[i] != query_ids[i - 1]) { group_ptr_.push_back(i); } } if (group_ptr_.back() != query_ids.size()) { group_ptr_.push_back(query_ids.size()); } - } else if (!std::strcmp(key, "label_lower_bound")) { - auto& labels = labels_lower_bound_.HostVector(); - labels.resize(num); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, labels.begin())); - } else if (!std::strcmp(key, "label_upper_bound")) { - auto& labels = labels_upper_bound_.HostVector(); - labels.resize(num); - DISPATCH_CONST_PTR(dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, labels.begin())); - } else if (!std::strcmp(key, "feature_weights")) { - auto &h_feature_weights = feature_weigths.HostVector(); - h_feature_weights.resize(num); - DISPATCH_CONST_PTR( - dtype, dptr, cast_dptr, - std::copy(cast_dptr, cast_dptr + num, h_feature_weights.begin())); + data::ValidateQueryGroup(group_ptr_); + return; + } + // float info + linalg::Tensor t; + CopyTensorInfoImpl<1>(arr, &t); + if (key == "label") { + this->labels_ = std::move(*t.Data()); + auto const& h_labels = labels_.ConstHostVector(); + auto valid = std::none_of(h_labels.cbegin(), h_labels.cend(), data::LabelsCheck{}); + CHECK(valid) << "Label contains NaN, infinity or a value too large."; + } else if (key == "weight") { + this->weights_ = std::move(*t.Data()); + auto const& h_weights = this->weights_.ConstHostVector(); + auto valid = std::none_of(h_weights.cbegin(), h_weights.cend(), + [](float w) { return w < 0 || std::isinf(w) || std::isnan(w); }); + CHECK(valid) << "Weights must be positive values."; + } else if (key == "label_lower_bound") { + this->labels_lower_bound_ = std::move(*t.Data()); + } else if (key == "label_upper_bound") { + this->labels_upper_bound_ = std::move(*t.Data()); + } else if (key == "feature_weights") { + this->feature_weights = std::move(*t.Data()); + auto const& h_feature_weights = feature_weights.ConstHostVector(); bool valid = - std::none_of(h_feature_weights.cbegin(), 
h_feature_weights.cend(), - [](float w) { return w < 0; }); + std::none_of(h_feature_weights.cbegin(), h_feature_weights.cend(), data::WeightsCheck{}); CHECK(valid) << "Feature weight must be greater than 0."; } else { LOG(FATAL) << "Unknown key for MetaInfo: " << key; } } -void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype, - const void **out_dptr) const { +void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t num) { + auto proc = [&](auto cast_d_ptr) { + using T = std::remove_pointer_t; + auto t = + linalg::TensorView(common::Span{cast_d_ptr, num}, {num}, GenericParameter::kCpuId); + CHECK(t.Contiguous()); + Json interface { t.ArrayInterface() }; + assert(ArrayInterface<1>{interface}.is_contiguous); + return interface; + }; + // Legacy code using XGBoost dtype, which is a small subset of array interface types. + switch (dtype) { + case xgboost::DataType::kFloat32: { + auto cast_ptr = reinterpret_cast(dptr); + this->SetInfoFromHost(key, proc(cast_ptr)); + break; + } + case xgboost::DataType::kDouble: { + auto cast_ptr = reinterpret_cast(dptr); + this->SetInfoFromHost(key, proc(cast_ptr)); + break; + } + case xgboost::DataType::kUInt32: { + auto cast_ptr = reinterpret_cast(dptr); + this->SetInfoFromHost(key, proc(cast_ptr)); + break; + } + case xgboost::DataType::kUInt64: { + auto cast_ptr = reinterpret_cast(dptr); + this->SetInfoFromHost(key, proc(cast_ptr)); + break; + } + default: + LOG(FATAL) << "Unknown data type" << static_cast(dtype); + } +} + +void MetaInfo::GetInfo(char const* key, bst_ulong* out_len, DataType dtype, + const void** out_dptr) const { if (dtype == DataType::kFloat32) { const std::vector* vec = nullptr; if (!std::strcmp(key, "label")) { @@ -454,13 +572,13 @@ void MetaInfo::GetInfo(char const *key, bst_ulong *out_len, DataType dtype, } else if (!std::strcmp(key, "weight")) { vec = &this->weights_.HostVector(); } else if (!std::strcmp(key, "base_margin")) { - vec = &this->base_margin_.HostVector(); + vec = &this->base_margin_.Data()->HostVector(); } else if (!std::strcmp(key, "label_lower_bound")) { vec = &this->labels_lower_bound_.HostVector(); } else if (!std::strcmp(key, "label_upper_bound")) { vec = &this->labels_upper_bound_.HostVector(); } else if (!std::strcmp(key, "feature_weights")) { - vec = &this->feature_weigths.HostVector(); + vec = &this->feature_weights.HostVector(); } else { LOG(FATAL) << "Unknown float field name: " << key; } @@ -543,8 +661,7 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col this->labels_upper_bound_.SetDevice(that.labels_upper_bound_.DeviceIdx()); this->labels_upper_bound_.Extend(that.labels_upper_bound_); - this->base_margin_.SetDevice(that.base_margin_.DeviceIdx()); - this->base_margin_.Extend(that.base_margin_); + linalg::Stack(&this->base_margin_, that.base_margin_); if (this->group_ptr_.size() == 0) { this->group_ptr_ = that.group_ptr_; @@ -566,10 +683,10 @@ void MetaInfo::Extend(MetaInfo const& that, bool accumulate_rows, bool check_col auto &h_feature_types = feature_types.HostVector(); LoadFeatureType(this->feature_type_names, &h_feature_types); } - if (!that.feature_weigths.Empty()) { - this->feature_weigths.Resize(that.feature_weigths.Size()); - this->feature_weigths.SetDevice(that.feature_weigths.DeviceIdx()); - this->feature_weigths.Copy(that.feature_weigths); + if (!that.feature_weights.Empty()) { + this->feature_weights.Resize(that.feature_weights.Size()); + this->feature_weights.SetDevice(that.feature_weights.DeviceIdx()); + 
this->feature_weights.Copy(that.feature_weights); } } @@ -612,10 +729,10 @@ void MetaInfo::Validate(int32_t device) const { check_device(labels_lower_bound_); return; } - if (feature_weigths.Size() != 0) { - CHECK_EQ(feature_weigths.Size(), num_col_) + if (feature_weights.Size() != 0) { + CHECK_EQ(feature_weights.Size(), num_col_) << "Size of feature_weights must equal to number of columns."; - check_device(feature_weigths); + check_device(feature_weights); } if (labels_upper_bound_.Size() != 0) { CHECK_EQ(labels_upper_bound_.Size(), num_row_) @@ -627,12 +744,12 @@ void MetaInfo::Validate(int32_t device) const { if (base_margin_.Size() != 0) { CHECK_EQ(base_margin_.Size() % num_row_, 0) << "Size of base margin must be a multiple of number of rows."; - check_device(base_margin_); + check_device(*base_margin_.Data()); } } #if !defined(XGBOOST_USE_CUDA) -void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { +void MetaInfo::SetInfoFromCUDA(StringView key, Json arr) { common::AssertGPUSupport(); } #endif // !defined(XGBOOST_USE_CUDA) @@ -788,10 +905,10 @@ DMatrix* DMatrix::Load(const std::string& uri, LOG(CONSOLE) << info.group_ptr_.size() - 1 << " groups are loaded from " << fname << ".group"; } - if (MetaTryLoadFloatInfo - (fname + ".base_margin", &info.base_margin_.HostVector()) && !silent) { - LOG(CONSOLE) << info.base_margin_.Size() - << " base_margin are loaded from " << fname << ".base_margin"; + if (MetaTryLoadFloatInfo(fname + ".base_margin", &info.base_margin_.Data()->HostVector()) && + !silent) { + LOG(CONSOLE) << info.base_margin_.Size() << " base_margin are loaded from " << fname + << ".base_margin"; } if (MetaTryLoadFloatInfo (fname + ".weight", &info.weights_.HostVector()) && !silent) { diff --git a/src/data/data.cu b/src/data/data.cu index c4c7c503cb2e..b3bb2ccc6d61 100644 --- a/src/data/data.cu +++ b/src/data/data.cu @@ -9,84 +9,79 @@ #include "xgboost/json.h" #include "array_interface.h" #include "../common/device_helpers.cuh" +#include "../common/linalg_op.cuh" #include "device_adapter.cuh" #include "simple_dmatrix.h" +#include "validation.h" namespace xgboost { - -void CopyInfoImpl(ArrayInterface column, HostDeviceVector* out) { - auto SetDeviceToPtr = [](void* ptr) { - cudaPointerAttributes attr; - dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr)); - int32_t ptr_device = attr.device; - if (ptr_device >= 0) { - dh::safe_cuda(cudaSetDevice(ptr_device)); - } - return ptr_device; - }; - auto ptr_device = SetDeviceToPtr(column.data); - - if (column.num_rows == 0) { - return; - } - out->SetDevice(ptr_device); - - size_t size = column.num_rows * column.num_cols; - CHECK_NE(size, 0); - out->Resize(size); - - auto p_dst = thrust::device_pointer_cast(out->DevicePointer()); - dh::LaunchN(size, [=] __device__(size_t idx) { - size_t ridx = idx / column.num_cols; - size_t cidx = idx - (ridx * column.num_cols); - p_dst[idx] = column.GetElement(ridx, cidx); - }); -} - namespace { -auto SetDeviceToPtr(void *ptr) { +auto SetDeviceToPtr(void* ptr) { cudaPointerAttributes attr; dh::safe_cuda(cudaPointerGetAttributes(&attr, ptr)); int32_t ptr_device = attr.device; dh::safe_cuda(cudaSetDevice(ptr_device)); return ptr_device; } -} // anonymous namespace -void CopyGroupInfoImpl(ArrayInterface column, std::vector* out) { - CHECK(column.type != ArrayInterface::kF4 && column.type != ArrayInterface::kF8) +template +void CopyTensorInfoImpl(Json arr_interface, linalg::Tensor* p_out) { + ArrayInterface array(arr_interface); + if (array.n == 0) { + return; + } + 
CHECK(array.valid.Size() == 0) << "Meta info like label or weight can not have missing value."; + auto ptr_device = SetDeviceToPtr(array.data); + + if (array.is_contiguous && array.type == ToDType::kType) { + p_out->ModifyInplace([&](HostDeviceVector* data, common::Span shape) { + // set shape + std::copy(array.shape, array.shape + D, shape.data()); + // set data + data->SetDevice(ptr_device); + data->Resize(array.n); + dh::safe_cuda(cudaMemcpyAsync(data->DevicePointer(), array.data, array.n * sizeof(T), + cudaMemcpyDefault)); + }); + return; + } + p_out->SetDevice(ptr_device); + p_out->Reshape(array.shape); + auto t = p_out->View(ptr_device); + linalg::ElementWiseKernelDevice(t, [=] __device__(size_t i, T) { + return linalg::detail::Apply(TypedIndex{array}, linalg::UnravelIndex(i, array.shape)); + }); +} + +void CopyGroupInfoImpl(ArrayInterface<1> column, std::vector* out) { + CHECK(column.type != ArrayInterfaceHandler::kF4 && column.type != ArrayInterfaceHandler::kF8) << "Expected integer for group info."; auto ptr_device = SetDeviceToPtr(column.data); CHECK_EQ(ptr_device, dh::CurrentDevice()); - dh::TemporaryArray temp(column.num_rows); - auto d_tmp = temp.data(); + dh::TemporaryArray temp(column.Shape(0)); + auto d_tmp = temp.data().get(); - dh::LaunchN(column.num_rows, [=] __device__(size_t idx) { - d_tmp[idx] = column.GetElement(idx, 0); - }); - auto length = column.num_rows; + dh::LaunchN(column.Shape(0), + [=] __device__(size_t idx) { d_tmp[idx] = column.operator()(idx); }); + auto length = column.Shape(0); out->resize(length + 1); out->at(0) = 0; thrust::copy(temp.data(), temp.data() + length, out->begin() + 1); std::partial_sum(out->begin(), out->end(), out->begin()); } -void CopyQidImpl(ArrayInterface array_interface, - std::vector *p_group_ptr) { +void CopyQidImpl(ArrayInterface<1> array_interface, std::vector* p_group_ptr) { auto &group_ptr_ = *p_group_ptr; auto it = dh::MakeTransformIterator( thrust::make_counting_iterator(0ul), - [array_interface] __device__(size_t i) { - return array_interface.GetElement(i, 0); - }); + [array_interface] __device__(size_t i) { return array_interface.operator()(i); }); dh::caching_device_vector flag(1); auto d_flag = dh::ToSpan(flag); auto d = SetDeviceToPtr(array_interface.data); dh::LaunchN(1, [=] __device__(size_t) { d_flag[0] = true; }); - dh::LaunchN(array_interface.num_rows - 1, [=] __device__(size_t i) { - if (array_interface.GetElement(i, 0) > - array_interface.GetElement(i + 1, 0)) { + dh::LaunchN(array_interface.Shape(0) - 1, [=] __device__(size_t i) { + if (array_interface.operator()(i) > array_interface.operator()(i + 1)) { d_flag[0] = false; } }); @@ -95,16 +90,16 @@ void CopyQidImpl(ArrayInterface array_interface, cudaMemcpyDeviceToHost)); CHECK(non_dec) << "`qid` must be sorted in increasing order along with data."; size_t bytes = 0; - dh::caching_device_vector out(array_interface.num_rows); - dh::caching_device_vector cnt(array_interface.num_rows); + dh::caching_device_vector out(array_interface.Shape(0)); + dh::caching_device_vector cnt(array_interface.Shape(0)); HostDeviceVector d_num_runs_out(1, 0, d); cub::DeviceRunLengthEncode::Encode( nullptr, bytes, it, out.begin(), cnt.begin(), - d_num_runs_out.DevicePointer(), array_interface.num_rows); + d_num_runs_out.DevicePointer(), array_interface.Shape(0)); dh::caching_device_vector tmp(bytes); cub::DeviceRunLengthEncode::Encode( tmp.data().get(), bytes, it, out.begin(), cnt.begin(), - d_num_runs_out.DevicePointer(), array_interface.num_rows); + d_num_runs_out.DevicePointer(), 
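For the group field the input is per-group document counts, and group_ptr_ is their running sum with a leading zero; the host-side equivalent of the partial_sum step above is simply:

#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<unsigned> sizes{3, 2, 4};  // documents per query group
  std::vector<unsigned> group_ptr(sizes.size() + 1, 0);
  // group_ptr[0] stays 0; each following entry is the cumulative count.
  std::partial_sum(sizes.begin(), sizes.end(), group_ptr.begin() + 1);
  for (auto v : group_ptr) {
    std::cout << v << " ";  // 0 3 5 9
  }
  std::cout << "\n";
}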
array_interface.Shape(0)); auto h_num_runs_out = d_num_runs_out.HostSpan()[0]; group_ptr_.clear(); @@ -115,77 +110,52 @@ void CopyQidImpl(ArrayInterface array_interface, thrust::copy(cnt.begin(), cnt.begin() + h_num_runs_out, group_ptr_.begin() + 1); } +} // namespace -namespace { -// thrust::all_of tries to copy lambda function. -struct LabelsCheck { - __device__ bool operator()(float y) { return ::isnan(y) || ::isinf(y); } -}; -struct WeightsCheck { - __device__ bool operator()(float w) { return LabelsCheck{}(w) || w < 0; } // NOLINT -}; -} // anonymous namespace - -void ValidateQueryGroup(std::vector const &group_ptr_); - -void MetaInfo::SetInfo(const char * c_key, std::string const& interface_str) { - Json j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); - ArrayInterface array_interface(interface_str); - std::string key{c_key}; - - CHECK(!array_interface.valid.Data()) - << "Meta info " << key << " should be dense, found validity mask"; - if (array_interface.num_rows == 0) { - return; - } - +void MetaInfo::SetInfoFromCUDA(StringView key, Json array) { + // multi-dim float info if (key == "base_margin") { - CopyInfoImpl(array_interface, &base_margin_); + CopyTensorInfoImpl(array, &base_margin_); return; } - - CHECK(array_interface.num_cols == 1 || array_interface.num_rows == 1) - << "MetaInfo: " << c_key << " has invalid shape"; - if (!((array_interface.num_cols == 1 && array_interface.num_rows == 0) || - (array_interface.num_cols == 0 && array_interface.num_rows == 1))) { - // Not an empty column, transform it. - array_interface.AsColumnVector(); + // uint info + if (key == "group") { + auto array_interface{ArrayInterface<1>(array)}; + CopyGroupInfoImpl(array_interface, &group_ptr_); + data::ValidateQueryGroup(group_ptr_); + return; + } else if (key == "qid") { + auto array_interface{ArrayInterface<1>(array)}; + CopyQidImpl(array_interface, &group_ptr_); + data::ValidateQueryGroup(group_ptr_); + return; } + // float info + linalg::Tensor t; + CopyTensorInfoImpl(array, &t); if (key == "label") { - CopyInfoImpl(array_interface, &labels_); + this->labels_ = std::move(*t.Data()); auto ptr = labels_.ConstDevicePointer(); - auto valid = thrust::none_of(thrust::device, ptr, ptr + labels_.Size(), - LabelsCheck{}); + auto valid = thrust::none_of(thrust::device, ptr, ptr + labels_.Size(), data::LabelsCheck{}); CHECK(valid) << "Label contains NaN, infinity or a value too large."; } else if (key == "weight") { - CopyInfoImpl(array_interface, &weights_); + this->weights_ = std::move(*t.Data()); auto ptr = weights_.ConstDevicePointer(); - auto valid = thrust::none_of(thrust::device, ptr, ptr + weights_.Size(), - WeightsCheck{}); + auto valid = thrust::none_of(thrust::device, ptr, ptr + weights_.Size(), data::WeightsCheck{}); CHECK(valid) << "Weights must be positive values."; - } else if (key == "group") { - CopyGroupInfoImpl(array_interface, &group_ptr_); - ValidateQueryGroup(group_ptr_); - return; - } else if (key == "qid") { - CopyQidImpl(array_interface, &group_ptr_); - return; } else if (key == "label_lower_bound") { - CopyInfoImpl(array_interface, &labels_lower_bound_); - return; + this->labels_lower_bound_ = std::move(*t.Data()); } else if (key == "label_upper_bound") { - CopyInfoImpl(array_interface, &labels_upper_bound_); - return; + this->labels_upper_bound_ = std::move(*t.Data()); } else if (key == "feature_weights") { - CopyInfoImpl(array_interface, &feature_weigths); - auto d_feature_weights = feature_weigths.ConstDeviceSpan(); - auto valid = thrust::none_of( - 
thrust::device, d_feature_weights.data(), - d_feature_weights.data() + d_feature_weights.size(), WeightsCheck{}); + this->feature_weights = std::move(*t.Data()); + auto d_feature_weights = feature_weights.ConstDeviceSpan(); + auto valid = + thrust::none_of(thrust::device, d_feature_weights.data(), + d_feature_weights.data() + d_feature_weights.size(), data::WeightsCheck{}); CHECK(valid) << "Feature weight must be greater than 0."; - return; } else { - LOG(FATAL) << "Unknown metainfo: " << key; + LOG(FATAL) << "Unknown key for MetaInfo: " << key; } } diff --git a/src/data/device_adapter.cuh b/src/data/device_adapter.cuh index 628878f319f1..d1bda280a7d5 100644 --- a/src/data/device_adapter.cuh +++ b/src/data/device_adapter.cuh @@ -20,7 +20,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { public: CudfAdapterBatch() = default; - CudfAdapterBatch(common::Span columns, size_t num_rows) + CudfAdapterBatch(common::Span> columns, size_t num_rows) : columns_(columns), num_rows_(num_rows) {} size_t Size() const { return num_rows_ * columns_.size(); } @@ -29,7 +29,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { size_t row_idx = idx / columns_.size(); auto const& column = columns_[column_idx]; float value = column.valid.Data() == nullptr || column.valid.Check(row_idx) - ? column.GetElement(row_idx, 0) + ? column(row_idx) : std::numeric_limits::quiet_NaN(); return {row_idx, column_idx, value}; } @@ -38,7 +38,7 @@ class CudfAdapterBatch : public detail::NoMetaInfo { XGBOOST_DEVICE bst_row_t NumCols() const { return columns_.size(); } private: - common::Span columns_; + common::Span> columns_; size_t num_rows_; }; @@ -101,9 +101,9 @@ class CudfAdapter : public detail::SingleBatchDataIter { auto const& typestr = get(json_columns[0]["typestr"]); CHECK_EQ(typestr.size(), 3) << ArrayInterfaceErrors::TypestrFormat(); - std::vector columns; - auto first_column = ArrayInterface(get(json_columns[0])); - num_rows_ = first_column.num_rows; + std::vector> columns; + auto first_column = ArrayInterface<1>(get(json_columns[0])); + num_rows_ = first_column.Shape(0); if (num_rows_ == 0) { return; } @@ -112,13 +112,12 @@ class CudfAdapter : public detail::SingleBatchDataIter { CHECK_NE(device_idx_, -1); dh::safe_cuda(cudaSetDevice(device_idx_)); for (auto& json_col : json_columns) { - auto column = ArrayInterface(get(json_col)); + auto column = ArrayInterface<1>(get(json_col)); columns.push_back(column); - CHECK_EQ(column.num_cols, 1); - num_rows_ = std::max(num_rows_, size_t(column.num_rows)); + num_rows_ = std::max(num_rows_, size_t(column.Shape(0))); CHECK_EQ(device_idx_, dh::CudaGetPointerDevice(column.data)) << "All columns should use the same device."; - CHECK_EQ(num_rows_, column.num_rows) + CHECK_EQ(num_rows_, column.Shape(0)) << "All columns should have same number of rows."; } columns_ = columns; @@ -135,7 +134,7 @@ class CudfAdapter : public detail::SingleBatchDataIter { private: CudfAdapterBatch batch_; - dh::device_vector columns_; + dh::device_vector> columns_; size_t num_rows_{0}; int device_idx_; }; @@ -143,23 +142,23 @@ class CudfAdapter : public detail::SingleBatchDataIter { class CupyAdapterBatch : public detail::NoMetaInfo { public: CupyAdapterBatch() = default; - explicit CupyAdapterBatch(ArrayInterface array_interface) + explicit CupyAdapterBatch(ArrayInterface<2> array_interface) : array_interface_(std::move(array_interface)) {} size_t Size() const { - return array_interface_.num_rows * array_interface_.num_cols; + return array_interface_.Shape(0) * array_interface_.Shape(1); } 
__device__ COOTuple GetElement(size_t idx) const { - size_t column_idx = idx % array_interface_.num_cols; - size_t row_idx = idx / array_interface_.num_cols; - float value = array_interface_.GetElement(row_idx, column_idx); + size_t column_idx = idx % array_interface_.Shape(1); + size_t row_idx = idx / array_interface_.Shape(1); + float value = array_interface_(row_idx, column_idx); return {row_idx, column_idx, value}; } - XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.num_rows; } - XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.num_cols; } + XGBOOST_DEVICE bst_row_t NumRows() const { return array_interface_.Shape(0); } + XGBOOST_DEVICE bst_row_t NumCols() const { return array_interface_.Shape(1); } private: - ArrayInterface array_interface_; + ArrayInterface<2> array_interface_; }; class CupyAdapter : public detail::SingleBatchDataIter { @@ -167,9 +166,9 @@ class CupyAdapter : public detail::SingleBatchDataIter { explicit CupyAdapter(std::string cuda_interface_str) { Json json_array_interface = Json::Load({cuda_interface_str.c_str(), cuda_interface_str.size()}); - array_interface_ = ArrayInterface(get(json_array_interface), false); + array_interface_ = ArrayInterface<2>(get(json_array_interface)); batch_ = CupyAdapterBatch(array_interface_); - if (array_interface_.num_rows == 0) { + if (array_interface_.Shape(0) == 0) { return; } device_idx_ = dh::CudaGetPointerDevice(array_interface_.data); @@ -177,12 +176,12 @@ class CupyAdapter : public detail::SingleBatchDataIter { } const CupyAdapterBatch& Value() const override { return batch_; } - size_t NumRows() const { return array_interface_.num_rows; } - size_t NumColumns() const { return array_interface_.num_cols; } + size_t NumRows() const { return array_interface_.Shape(0); } + size_t NumColumns() const { return array_interface_.Shape(1); } int32_t DeviceIdx() const { return device_idx_; } private: - ArrayInterface array_interface_; + ArrayInterface<2> array_interface_; CupyAdapterBatch batch_; int32_t device_idx_ {-1}; }; diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h index 6d6adb62b008..70a5d51c30b9 100644 --- a/src/data/file_iterator.h +++ b/src/data/file_iterator.h @@ -12,6 +12,7 @@ #include "dmlc/data.h" #include "xgboost/c_api.h" #include "xgboost/json.h" +#include "xgboost/linalg.h" #include "array_interface.h" namespace xgboost { @@ -58,16 +59,14 @@ class FileIterator { CHECK(parser_); if (parser_->Next()) { row_block_ = parser_->Value(); + using linalg::MakeVec; - indptr_ = MakeArrayInterface(row_block_.offset, row_block_.size + 1); - values_ = MakeArrayInterface(row_block_.value, - row_block_.offset[row_block_.size]); - indices_ = MakeArrayInterface(row_block_.index, - row_block_.offset[row_block_.size]); + indptr_ = MakeVec(row_block_.offset, row_block_.size + 1).ArrayInterfaceStr(); + values_ = MakeVec(row_block_.value, row_block_.offset[row_block_.size]).ArrayInterfaceStr(); + indices_ = MakeVec(row_block_.index, row_block_.offset[row_block_.size]).ArrayInterfaceStr(); - size_t n_columns = *std::max_element( - row_block_.index, - row_block_.index + row_block_.offset[row_block_.size]); + size_t n_columns = *std::max_element(row_block_.index, + row_block_.index + row_block_.offset[row_block_.size]); // dmlc parser converts 1-based indexing back to 0-based indexing so we can ignore // this condition and just add 1 to n_columns n_columns += 1; diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index 44a8a3f8fe7c..e83559d3958a 100644 --- 
a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -137,9 +137,10 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, int nthread) { batch.Weights() + batch.Size()); } if (batch.BaseMargin() != nullptr) { - auto& base_margin = info_.base_margin_.HostVector(); - base_margin.insert(base_margin.end(), batch.BaseMargin(), - batch.BaseMargin() + batch.Size()); + info_.base_margin_ = linalg::Tensor{batch.BaseMargin(), + batch.BaseMargin() + batch.Size(), + {batch.Size()}, + GenericParameter::kCpuId}; } if (batch.Qid() != nullptr) { qids.insert(qids.end(), batch.Qid(), batch.Qid() + batch.Size()); diff --git a/src/data/validation.h b/src/data/validation.h new file mode 100644 index 000000000000..6d3701114886 --- /dev/null +++ b/src/data/validation.h @@ -0,0 +1,40 @@ +/*! + * Copyright 2021 by XGBoost Contributors + */ +#ifndef XGBOOST_DATA_VALIDATION_H_ +#define XGBOOST_DATA_VALIDATION_H_ +#include +#include + +#include "xgboost/base.h" +#include "xgboost/logging.h" + +namespace xgboost { +namespace data { +struct LabelsCheck { + XGBOOST_DEVICE bool operator()(float y) { +#if defined(__CUDA_ARCH__) + return ::isnan(y) || ::isinf(y); +#else + return std::isnan(y) || std::isinf(y); +#endif + } +}; + +struct WeightsCheck { + XGBOOST_DEVICE bool operator()(float w) { return LabelsCheck{}(w) || w < 0; } // NOLINT +}; + +inline void ValidateQueryGroup(std::vector const &group_ptr_) { + bool valid_query_group = true; + for (size_t i = 1; i < group_ptr_.size(); ++i) { + valid_query_group = valid_query_group && group_ptr_[i] >= group_ptr_[i - 1]; + if (XGBOOST_EXPECT(!valid_query_group, false)) { + break; + } + } + CHECK(valid_query_group) << "Invalid group structure."; +} +} // namespace data +} // namespace xgboost +#endif // XGBOOST_DATA_VALIDATION_H_ diff --git a/src/gbm/gblinear.cc b/src/gbm/gblinear.cc index e5f2a457214c..e5f5916211cc 100644 --- a/src/gbm/gblinear.cc +++ b/src/gbm/gblinear.cc @@ -178,7 +178,7 @@ class GBLinear : public GradientBooster { unsigned layer_begin, unsigned layer_end, bool, int, unsigned) override { model_.LazyInitModel(); LinearCheckLayer(layer_begin, layer_end); - const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector(); + auto base_margin = p_fmat->Info().base_margin_.View(GenericParameter::kCpuId); const int ngroup = model_.learner_model_param->num_output_group; const size_t ncolumns = model_.learner_model_param->num_feature + 1; // allocate space for (#features + bias) times #groups times #rows @@ -203,9 +203,9 @@ class GBLinear : public GradientBooster { p_contribs[ins.index] = ins.fvalue * model_[ins.index][gid]; } // add base margin to BIAS - p_contribs[ncolumns - 1] = model_.Bias()[gid] + - ((base_margin.size() != 0) ? base_margin[row_idx * ngroup + gid] : - learner_model_param_->base_score); + p_contribs[ncolumns - 1] = + model_.Bias()[gid] + ((base_margin.Size() != 0) ? 
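The new validation.h keeps label and weight checks as plain functors so the same predicate can run under thrust on device and the std algorithms on host. A host-only sketch of how such a functor composes with std::none_of (demo type, not the header's):

#include <algorithm>
#include <cmath>
#include <iostream>
#include <vector>

struct WeightsCheckDemo {  // same shape as data::WeightsCheck, host-only here
  bool operator()(float w) const { return std::isnan(w) || std::isinf(w) || w < 0; }
};

int main() {
  std::vector<float> weights{0.5f, 1.0f, 2.0f};
  // Valid when no element trips the predicate.
  std::cout << std::none_of(weights.cbegin(), weights.cend(), WeightsCheckDemo{}) << "\n";  // 1
  weights.push_back(-1.0f);
  std::cout << std::none_of(weights.cbegin(), weights.cend(), WeightsCheckDemo{}) << "\n";  // 0
}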
base_margin(row_idx, gid) + : learner_model_param_->base_score); } }); } @@ -270,7 +270,7 @@ class GBLinear : public GradientBooster { monitor_.Start("PredictBatchInternal"); model_.LazyInitModel(); std::vector &preds = *out_preds; - const auto& base_margin = p_fmat->Info().base_margin_.ConstHostVector(); + auto base_margin = p_fmat->Info().base_margin_.View(GenericParameter::kCpuId); // start collecting the prediction const int ngroup = model_.learner_model_param->num_output_group; preds.resize(p_fmat->Info().num_row_ * ngroup); @@ -280,16 +280,15 @@ class GBLinear : public GradientBooster { // k is number of group // parallel over local batch const auto nsize = static_cast(batch.Size()); - if (base_margin.size() != 0) { - CHECK_EQ(base_margin.size(), nsize * ngroup); + if (base_margin.Size() != 0) { + CHECK_EQ(base_margin.Size(), nsize * ngroup); } common::ParallelFor(nsize, [&](omp_ulong i) { const size_t ridx = page.base_rowid + i; // loop over output groups for (int gid = 0; gid < ngroup; ++gid) { - bst_float margin = - (base_margin.size() != 0) ? - base_margin[ridx * ngroup + gid] : learner_model_param_->base_score; + float margin = + (base_margin.Size() != 0) ? base_margin(ridx, gid) : learner_model_param_->base_score; this->Pred(batch[i], &preds[ridx * ngroup], gid, margin); } }); diff --git a/src/predictor/cpu_predictor.cc b/src/predictor/cpu_predictor.cc index d581f64a1d56..92797235d16b 100644 --- a/src/predictor/cpu_predictor.cc +++ b/src/predictor/cpu_predictor.cc @@ -282,27 +282,6 @@ class CPUPredictor : public Predictor { } } - void InitOutPredictions(const MetaInfo& info, - HostDeviceVector* out_preds, - const gbm::GBTreeModel& model) const override { - CHECK_NE(model.learner_model_param->num_output_group, 0); - size_t n = model.learner_model_param->num_output_group * info.num_row_; - const auto& base_margin = info.base_margin_.HostVector(); - out_preds->Resize(n); - std::vector& out_preds_h = out_preds->HostVector(); - if (base_margin.empty()) { - std::fill(out_preds_h.begin(), out_preds_h.end(), - model.learner_model_param->base_score); - } else { - std::string expected{ - "(" + std::to_string(info.num_row_) + ", " + - std::to_string(model.learner_model_param->num_output_group) + ")"}; - CHECK_EQ(base_margin.size(), n) - << "Invalid shape of base_margin. 
Expected:" << expected; - std::copy(base_margin.begin(), base_margin.end(), out_preds_h.begin()); - } - } - public: explicit CPUPredictor(GenericParameter const* generic_param) : Predictor::Predictor{generic_param} {} @@ -456,7 +435,7 @@ class CPUPredictor : public Predictor { common::ParallelFor(bst_omp_uint(ntree_limit), [&](bst_omp_uint i) { FillNodeMeanValues(model.trees[i].get(), &(mean_values[i])); }); - const std::vector& base_margin = info.base_margin_.HostVector(); + auto base_margin = info.base_margin_.View(GenericParameter::kCpuId); // start collecting the contributions for (const auto &batch : p_fmat->GetBatches()) { auto page = batch.GetView(); @@ -496,8 +475,9 @@ class CPUPredictor : public Predictor { } feats.Drop(page[i]); // add base margin to BIAS - if (base_margin.size() != 0) { - p_contribs[ncolumns - 1] += base_margin[row_idx * ngroup + gid]; + if (base_margin.Size() != 0) { + CHECK_EQ(base_margin.Shape(1), ngroup); + p_contribs[ncolumns - 1] += base_margin(row_idx, gid); } else { p_contribs[ncolumns - 1] += model.learner_model_param->base_score; } diff --git a/src/predictor/gpu_predictor.cu b/src/predictor/gpu_predictor.cu index 51674237e973..025a1c495c52 100644 --- a/src/predictor/gpu_predictor.cu +++ b/src/predictor/gpu_predictor.cu @@ -854,8 +854,8 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(generic_param_->gpu_id); - const auto margin = p_fmat->Info().base_margin_.ConstDeviceSpan(); + p_fmat->Info().base_margin_.Data()->SetDevice(generic_param_->gpu_id); + const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); float base_score = model.learner_model_param->base_score; dh::LaunchN( p_fmat->Info().num_row_ * model.learner_model_param->num_output_group, @@ -913,8 +913,8 @@ class GPUPredictor : public xgboost::Predictor { dh::tend(phis)); } // Add the base margin term to last column - p_fmat->Info().base_margin_.SetDevice(generic_param_->gpu_id); - const auto margin = p_fmat->Info().base_margin_.ConstDeviceSpan(); + p_fmat->Info().base_margin_.Data()->SetDevice(generic_param_->gpu_id); + const auto margin = p_fmat->Info().base_margin_.Data()->ConstDeviceSpan(); float base_score = model.learner_model_param->base_score; size_t n_features = model.learner_model_param->num_feature; dh::LaunchN( @@ -928,27 +928,6 @@ class GPUPredictor : public xgboost::Predictor { }); } - protected: - void InitOutPredictions(const MetaInfo& info, - HostDeviceVector* out_preds, - const gbm::GBTreeModel& model) const override { - size_t n_classes = model.learner_model_param->num_output_group; - size_t n = n_classes * info.num_row_; - const HostDeviceVector& base_margin = info.base_margin_; - out_preds->SetDevice(generic_param_->gpu_id); - out_preds->Resize(n); - if (base_margin.Size() != 0) { - std::string expected{ - "(" + std::to_string(info.num_row_) + ", " + - std::to_string(model.learner_model_param->num_output_group) + ")"}; - CHECK_EQ(base_margin.Size(), n) - << "Invalid shape of base_margin. Expected:" << expected; - out_preds->Copy(base_margin); - } else { - out_preds->Fill(model.learner_model_param->base_score); - } - } - void PredictInstance(const SparsePage::Inst&, std::vector*, const gbm::GBTreeModel&, unsigned) const override { diff --git a/src/predictor/predictor.cc b/src/predictor/predictor.cc index 9aa18b19ce30..e9e10a632a7f 100644 --- a/src/predictor/predictor.cc +++ b/src/predictor/predictor.cc @@ -1,5 +1,5 @@ /*! 
- * Copyright 2017-2020 by Contributors + * Copyright 2017-2021 by Contributors */ #include #include @@ -8,6 +8,8 @@ #include "xgboost/data.h" #include "xgboost/generic_parameters.h" +#include "../gbm/gbtree.h" + namespace dmlc { DMLC_REGISTRY_ENABLE(::xgboost::PredictorReg); } // namespace dmlc @@ -58,6 +60,33 @@ Predictor* Predictor::Create( auto p_predictor = (e->body)(generic_param); return p_predictor; } + +void ValidateBaseMarginShape(linalg::Tensor const& margin, bst_row_t n_samples, + bst_group_t n_groups) { + // FIXME: Bindings other than Python don't have shape information. + std::string expected{"Invalid shape of base_margin. Expected: (" + std::to_string(n_samples) + + ", " + std::to_string(n_groups) + ")"}; + CHECK_EQ(margin.Shape(0), n_samples) << expected; + CHECK_EQ(margin.Shape(1), n_groups) << expected; +} + +void Predictor::InitOutPredictions(const MetaInfo& info, HostDeviceVector<bst_float>* out_preds, + const gbm::GBTreeModel& model) const { + CHECK_NE(model.learner_model_param->num_output_group, 0); + size_t n_classes = model.learner_model_param->num_output_group; + size_t n = n_classes * info.num_row_; + const HostDeviceVector<bst_float>* base_margin = info.base_margin_.Data(); + if (generic_param_->gpu_id >= 0) { + out_preds->SetDevice(generic_param_->gpu_id); + } + out_preds->Resize(n); + if (base_margin->Size() != 0) { + ValidateBaseMarginShape(info.base_margin_, info.num_row_, n_classes); + out_preds->Copy(*base_margin); + } else { + out_preds->Fill(model.learner_model_param->base_score); + } +} } // namespace xgboost namespace xgboost { diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index e8e6f50d88b2..c183ad910676 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -375,7 +375,7 @@ template <typename GradientSumT> class HistEvaluator { n_threads_{n_threads}, task_{task} { interaction_constraints_.Configure(param, info.num_col_); - column_sampler_->Init(info.num_col_, info.feature_weigths.HostVector(), + column_sampler_->Init(info.num_col_, info.feature_weights.HostVector(), param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree, skip_0_index); } diff --git a/src/tree/updater_colmaker.cc b/src/tree/updater_colmaker.cc index 52fb85821543..ba405cf10865 100644 --- a/src/tree/updater_colmaker.cc +++ b/src/tree/updater_colmaker.cc @@ -230,7 +230,7 @@ class ColMaker: public TreeUpdater { } { column_sampler_.Init(fmat.Info().num_col_, - fmat.Info().feature_weigths.ConstHostVector(), + fmat.Info().feature_weights.ConstHostVector(), param_.colsample_bynode, param_.colsample_bylevel, param_.colsample_bytree); } diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index d37227575986..9b673171fbbf 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -233,7 +233,7 @@ struct GPUHistMakerDevice { // thread safe void Reset(HostDeviceVector* dh_gpair, DMatrix* dmat, int64_t num_columns) { auto const& info = dmat->Info(); - this->column_sampler.Init(num_columns, info.feature_weigths.HostVector(), + this->column_sampler.Init(num_columns, info.feature_weights.HostVector(), param.colsample_bynode, param.colsample_bylevel, param.colsample_bytree); dh::safe_cuda(cudaSetDevice(device_id));
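Editorial note (not part of the patch): the per-backend InitOutPredictions overrides removed above are folded into the single base-class implementation in src/predictor/predictor.cc, and the old flat size check becomes an explicit shape check against (n_samples, n_groups). A minimal stand-alone sketch of that contract follows; Tensor2D and the exception are illustrative stand-ins for xgboost::linalg::Tensor and CHECK_EQ:

#include <cstddef>
#include <stdexcept>
#include <string>

struct Tensor2D {  // stand-in for the patch's typed tensor
  std::size_t shape[2];
  std::size_t Shape(std::size_t i) const { return shape[i]; }
};

void ValidateBaseMarginShapeSketch(Tensor2D const& margin, std::size_t n_samples,
                                   std::size_t n_groups) {
  // Mirrors the CHECK_EQ pair above: both dimensions must match exactly.
  if (margin.Shape(0) != n_samples || margin.Shape(1) != n_groups) {
    throw std::runtime_error("Invalid shape of base_margin. Expected: (" +
                             std::to_string(n_samples) + ", " + std::to_string(n_groups) + ")");
  }
}

int main() {
  Tensor2D margin{{100, 3}};                      // 100 rows, 3 output groups
  ValidateBaseMarginShapeSketch(margin, 100, 3);  // passes; (100, 1) would throw
}

The design point is that a flat buffer of length n_samples * n_groups can no longer be confused with its transpose: the shape now travels with the data.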
diff --git a/tests/cpp/common/test_hist_util.h b/tests/cpp/common/test_hist_util.h index c7fc4972d3ee..49952d202706 100644 --- a/tests/cpp/common/test_hist_util.h +++ b/tests/cpp/common/test_hist_util.h @@ -57,7 +57,7 @@ inline data::CupyAdapter AdapterFromData(const thrust::device_vector<float> &x, Json(Integer(reinterpret_cast(x.data().get()))), Json(Boolean(false))}; array_interface["data"] = j_data; - array_interface["version"] = Integer(static_cast(1)); + array_interface["version"] = 3; array_interface["typestr"] = String("<f4"); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc +#include #include #include #include +#include "../../../src/common/linalg_op.h" + namespace xgboost { namespace linalg { +namespace { +auto kCpuId = GenericParameter::kCpuId; +} + auto MakeMatrixFromTest(HostDeviceVector *storage, size_t n_rows, size_t n_cols) { storage->Resize(n_rows * n_cols); auto &h_storage = storage->HostVector(); @@ -16,16 +26,16 @@ auto MakeMatrixFromTest(HostDeviceVector *storage, size_t n_rows, size_t return m; } -TEST(Linalg, Matrix) { +TEST(Linalg, MatrixView) { size_t kRows = 31, kCols = 77; HostDeviceVector storage; auto m = MakeMatrixFromTest(&storage, kRows, kCols); - ASSERT_EQ(m.DeviceIdx(), GenericParameter::kCpuId); + ASSERT_EQ(m.DeviceIdx(), kCpuId); ASSERT_EQ(m(0, 0), 0); ASSERT_EQ(m(kRows - 1, kCols - 1), storage.Size() - 1); } -TEST(Linalg, Vector) { +TEST(Linalg, VectorView) { size_t kRows = 31, kCols = 77; HostDeviceVector storage; auto m = MakeMatrixFromTest(&storage, kRows, kCols); @@ -37,7 +47,7 @@ TEST(Linalg, Vector) { ASSERT_EQ(v(0), 3); } -TEST(Linalg, Tensor) { +TEST(Linalg, TensorView) { std::vector data(2 * 3 * 4, 0); std::iota(data.begin(), data.end(), 0); @@ -99,14 +109,123 @@ TEST(Linalg, Tensor) { } } +TEST(Linalg, Tensor) { + { + Tensor t{{2, 3, 4}, kCpuId}; + auto view = t.View(kCpuId); + + auto const &as_const = t; + auto k_view = as_const.View(kCpuId); + + size_t n = 2 * 3 * 4; + ASSERT_EQ(t.Size(), n); + ASSERT_TRUE(std::equal(k_view.cbegin(), k_view.cend(), view.begin())); + + Tensor t_0{std::move(t)}; + ASSERT_EQ(t_0.Size(), n); + ASSERT_EQ(t_0.Shape(0), 2); + ASSERT_EQ(t_0.Shape(1), 3); + ASSERT_EQ(t_0.Shape(2), 4); + } + { + // Reshape + Tensor t{{2, 3, 4}, kCpuId}; + t.Reshape(4, 3, 2); + ASSERT_EQ(t.Size(), 24); + ASSERT_EQ(t.Shape(2), 2); + t.Reshape(1); + ASSERT_EQ(t.Size(), 1); + t.Reshape(0, 0, 0); + ASSERT_EQ(t.Size(), 0); + t.Reshape(0, 3, 0); + ASSERT_EQ(t.Size(), 0); + ASSERT_EQ(t.Shape(1), 3); + t.Reshape(3, 3, 3); + ASSERT_EQ(t.Size(), 27); + } +} + TEST(Linalg, Empty) { - auto t = TensorView{{}, {0, 3}, GenericParameter::kCpuId}; - for (int32_t i : {0, 1, 2}) { - auto s = t.Slice(All(), i); - ASSERT_EQ(s.Size(), 0); - ASSERT_EQ(s.Shape().size(), 1); - ASSERT_EQ(s.Shape(0), 0); + { + auto t = TensorView{{}, {0, 3}, kCpuId}; + for (int32_t i : {0, 1, 2}) { + auto s = t.Slice(All(), i); + ASSERT_EQ(s.Size(), 0); + ASSERT_EQ(s.Shape().size(), 1); + ASSERT_EQ(s.Shape(0), 0); + } } + { + auto t = Tensor{{0, 3}, kCpuId}; + ASSERT_EQ(t.Size(), 0); + auto view = t.View(kCpuId); + + for (int32_t i : {0, 1, 2}) { + auto s = view.Slice(All(), i); + ASSERT_EQ(s.Size(), 0); + ASSERT_EQ(s.Shape().size(), 1); + ASSERT_EQ(s.Shape(0), 0); + } + } +} + +TEST(Linalg, ArrayInterface) { + auto cpu = kCpuId; + auto t = Tensor<double, 2>{{3, 3}, cpu}; + auto v = t.View(cpu); + std::iota(v.begin(), v.end(), 0); + auto arr = Json::Load(StringView{v.ArrayInterfaceStr()}); + ASSERT_EQ(get(arr["shape"][0]), 3); + ASSERT_EQ(get(arr["strides"][0]), 3 * sizeof(double)); + + ASSERT_FALSE(get(arr["data"][1])); + ASSERT_EQ(reinterpret_cast(get(arr["data"][0])), v.Values().data()); +} + 
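Editorial note (not part of the patch): the Popc (population count) helpers tested below come from the same linalg detail namespace, and UnravelIndex, exercised later in test_metainfo.h, converts a flat row-major offset back into per-dimension coordinates so that element-wise kernels can walk non-contiguous views. A stand-alone sketch of that conversion; UnravelIndexSketch is a hypothetical helper, not the library API:

#include <array>
#include <cstddef>
#include <iostream>

template <std::size_t D>
std::array<std::size_t, D> UnravelIndexSketch(std::size_t idx,
                                              std::array<std::size_t, D> const& shape) {
  std::array<std::size_t, D> out{};
  for (std::size_t i = D; i-- > 0;) {  // the last dimension varies fastest
    out[i] = idx % shape[i];
    idx /= shape[i];
  }
  return out;
}

int main() {
  // Flat offset 17 in a (4, 3, 2) tensor is (2, 2, 1): 17 = 2*6 + 2*2 + 1.
  auto c = UnravelIndexSketch<3>(17, {4, 3, 2});
  std::cout << c[0] << ", " << c[1] << ", " << c[2] << '\n';  // prints 2, 2, 1
}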
+TEST(Linalg, Popc) { + { + uint32_t v{0}; + ASSERT_EQ(detail::NativePopc(v), 0); + ASSERT_EQ(detail::Popc(v), 0); + v = 1; + ASSERT_EQ(detail::NativePopc(v), 1); + ASSERT_EQ(detail::Popc(v), 1); + v = 0xffffffff; + ASSERT_EQ(detail::NativePopc(v), 32); + ASSERT_EQ(detail::Popc(v), 32); + } + { + uint64_t v{0}; + ASSERT_EQ(detail::NativePopc(v), 0); + ASSERT_EQ(detail::Popc(v), 0); + v = 1; + ASSERT_EQ(detail::NativePopc(v), 1); + ASSERT_EQ(detail::Popc(v), 1); + v = 0xffffffff; + ASSERT_EQ(detail::NativePopc(v), 32); + ASSERT_EQ(detail::Popc(v), 32); + v = 0xffffffffffffffff; + ASSERT_EQ(detail::NativePopc(v), 64); + ASSERT_EQ(detail::Popc(v), 64); + } +} + +TEST(Linalg, Stack) { + Tensor l{{2, 3, 4}, kCpuId}; + ElementWiseKernelHost(l.View(kCpuId), omp_get_max_threads(), + [=](size_t i, float v) { return i; }); + Tensor r_0{{2, 3, 4}, kCpuId}; + ElementWiseKernelHost(r_0.View(kCpuId), omp_get_max_threads(), + [=](size_t i, float v) { return i; }); + + Stack(&l, r_0); + + Tensor r_1{{0, 3, 4}, kCpuId}; + Stack(&l, r_1); + ASSERT_EQ(l.Shape(0), 4); + + Stack(&r_1, l); + ASSERT_EQ(r_1.Shape(0), l.Shape(0)); } } // namespace linalg } // namespace xgboost diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu new file mode 100644 index 000000000000..abfef8bfd353 --- /dev/null +++ b/tests/cpp/common/test_linalg.cu @@ -0,0 +1,62 @@ +/*! + * Copyright 2021 by XGBoost Contributors + */ +#include + +#include "../../../src/common/linalg_op.cuh" +#include "xgboost/generic_parameters.h" +#include "xgboost/linalg.h" + +namespace xgboost { +namespace linalg { +namespace { +void TestElementWiseKernel() { + Tensor l{{2, 3, 4}, 0}; + { + /** + * Non-contiguous + */ + // GPU view + auto t = l.View(0).Slice(linalg::All(), 1, linalg::All()); + ASSERT_FALSE(t.Contiguous()); + ElementWiseKernelDevice(t, [] __device__(size_t i, float) { return i; }); + // CPU view + t = l.View(GenericParameter::kCpuId).Slice(linalg::All(), 1, linalg::All()); + size_t k = 0; + for (size_t i = 0; i < l.Shape(0); ++i) { + for (size_t j = 0; j < l.Shape(2); ++j) { + ASSERT_EQ(k++, t(i, j)); + } + } + + t = l.View(0).Slice(linalg::All(), 1, linalg::All()); + ElementWiseKernelDevice(t, [] __device__(size_t i, float v) { + SPAN_CHECK(v == i); + return v; + }); + } + + { + /** + * Contiguous + */ + auto t = l.View(0); + ElementWiseKernelDevice(t, [] __device__(size_t i, float) { return i; }); + ASSERT_TRUE(t.Contiguous()); + // CPU view + t = l.View(GenericParameter::kCpuId); + + size_t ind = 0; + for (size_t i = 0; i < l.Shape(0); ++i) { + for (size_t j = 0; j < l.Shape(1); ++j) { + for (size_t k = 0; k < l.Shape(2); ++k) { + ASSERT_EQ(ind++, t(i, j, k)); + } + } + } + } +} +} // anonymous namespace +TEST(Linalg, GPUElementWise) { TestElementWiseKernel(); } +} // namespace linalg +} // namespace xgboost diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc index ccb19de71a74..c6de226194f6 100644 --- a/tests/cpp/data/test_adapter.cc +++ b/tests/cpp/data/test_adapter.cc @@ -41,9 +41,10 @@ TEST(Adapter, CSRArrayAdapter) { HostDeviceVector indices; size_t n_features = 100, n_samples = 10; RandomDataGenerator{n_samples, n_features, 0.5}.GenerateCSR(&values, &indptr, &indices); - auto indptr_arr = MakeArrayInterface(indptr.HostPointer(), indptr.Size()); - auto values_arr = MakeArrayInterface(values.HostPointer(), values.Size()); - auto indices_arr = MakeVec(indices.HostPointer(), indices.Size()).ArrayInterfaceStr(); hold on
data::CSRArrayAdapter( StringView{indptr_arr.c_str(), indptr_arr.size()}, StringView{values_arr.c_str(), values_arr.size()}, diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index 875858855ed5..f438593b42a4 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -11,21 +11,22 @@ TEST(ArrayInterface, Initialize) { size_t constexpr kRows = 10, kCols = 10; HostDeviceVector storage; auto array = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage); - auto arr_interface = ArrayInterface(array); - ASSERT_EQ(arr_interface.num_rows, kRows); - ASSERT_EQ(arr_interface.num_cols, kCols); + auto arr_interface = ArrayInterface<2>(StringView{array}); + ASSERT_EQ(arr_interface.Shape(0), kRows); + ASSERT_EQ(arr_interface.Shape(1), kCols); ASSERT_EQ(arr_interface.data, storage.ConstHostPointer()); ASSERT_EQ(arr_interface.ElementSize(), 4); - ASSERT_EQ(arr_interface.type, ArrayInterface::kF4); + ASSERT_EQ(arr_interface.type, ArrayInterfaceHandler::kF4); HostDeviceVector u64_storage(storage.Size()); - std::string u64_arr_str; - Json::Dump(GetArrayInterface(&u64_storage, kRows, kCols), &u64_arr_str); + std::string u64_arr_str{linalg::TensorView{ + u64_storage.ConstHostSpan(), {kRows, kCols}, GenericParameter::kCpuId} + .ArrayInterfaceStr()}; std::copy(storage.ConstHostVector().cbegin(), storage.ConstHostVector().cend(), u64_storage.HostSpan().begin()); - auto u64_arr = ArrayInterface{u64_arr_str}; + auto u64_arr = ArrayInterface<2>{u64_arr_str}; ASSERT_EQ(u64_arr.ElementSize(), 8); - ASSERT_EQ(u64_arr.type, ArrayInterface::kU8); + ASSERT_EQ(u64_arr.type, ArrayInterfaceHandler::kU8); } TEST(ArrayInterface, Error) { @@ -38,23 +39,22 @@ TEST(ArrayInterface, Error) { Json(Boolean(false))}; auto const& column_obj = get(column); - std::pair shape{kRows, kCols}; std::string typestr{"(1)); + EXPECT_THROW(ArrayInterfaceHandler::ExtractData(column_obj, n), dmlc::Error); + column["version"] = 3; // missing data - EXPECT_THROW(ArrayInterfaceHandler::ExtractData(column_obj, shape), + EXPECT_THROW(ArrayInterfaceHandler::ExtractData(column_obj, n), dmlc::Error); column["data"] = j_data; // missing typestr - EXPECT_THROW(ArrayInterfaceHandler::ExtractData(column_obj, shape), + EXPECT_THROW(ArrayInterfaceHandler::ExtractData(column_obj, n), dmlc::Error); column["typestr"] = String(" storage; @@ -63,22 +63,41 @@ TEST(ArrayInterface, Error) { Json(Integer(reinterpret_cast(storage.ConstHostPointer()))), Json(Boolean(false))}; column["data"] = j_data; - EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractData(column_obj, shape)); + EXPECT_NO_THROW(ArrayInterfaceHandler::ExtractData(column_obj, n)); } TEST(ArrayInterface, GetElement) { size_t kRows = 4, kCols = 2; HostDeviceVector storage; auto intefrace_str = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage); - ArrayInterface array_interface{intefrace_str}; + ArrayInterface<2> array_interface{intefrace_str}; auto const& h_storage = storage.ConstHostVector(); for (size_t i = 0; i < kRows; ++i) { for (size_t j = 0; j < kCols; ++j) { - float v0 = array_interface.GetElement(i, j); + float v0 = array_interface(i, j); float v1 = h_storage.at(i * kCols + j); ASSERT_EQ(v0, v1); } } } + +TEST(ArrayInterface, TrivialDim) { + size_t kRows{1000}, kCols = 1; + HostDeviceVector storage; + auto interface_str = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage); + { + ArrayInterface<1> arr_i{interface_str}; + ASSERT_EQ(arr_i.n, kRows); + 
ASSERT_EQ(arr_i.Shape(0), kRows); + } + + std::swap(kRows, kCols); + interface_str = RandomDataGenerator{kRows, kCols, 0}.GenerateArrayInterface(&storage); + { + ArrayInterface<1> arr_i{interface_str}; + ASSERT_EQ(arr_i.n, kCols); + ASSERT_EQ(arr_i.Shape(0), kCols); + } +} } // namespace xgboost diff --git a/tests/cpp/data/test_array_interface.cu b/tests/cpp/data/test_array_interface.cu index 75923e74ba1a..c8e07852534b 100644 --- a/tests/cpp/data/test_array_interface.cu +++ b/tests/cpp/data/test_array_interface.cu @@ -32,11 +32,24 @@ TEST(ArrayInterface, Stream) { dh::caching_device_vector out(1, 0); uint64_t dur = 1e9; dh::LaunchKernel{1, 1, 0, stream}(SleepForTest, out.data().get(), dur); - ArrayInterface arr(arr_str); + ArrayInterface<2> arr(arr_str); auto t = out[0]; CHECK_GE(t, dur); cudaStreamDestroy(stream); } + +TEST(ArrayInterface, Ptr) { + std::vector h_data(10); + ASSERT_FALSE(ArrayInterfaceHandler::IsCudaPtr(h_data.data())); + dh::safe_cuda(cudaGetLastError()); + + dh::device_vector d_data(10); + ASSERT_TRUE(ArrayInterfaceHandler::IsCudaPtr(d_data.data().get())); + dh::safe_cuda(cudaGetLastError()); + + ASSERT_FALSE(ArrayInterfaceHandler::IsCudaPtr(nullptr)); + dh::safe_cuda(cudaGetLastError()); +} } // namespace xgboost diff --git a/tests/cpp/data/test_array_interface.h b/tests/cpp/data/test_array_interface.h index 7872a9507aa5..78bce76f53e7 100644 --- a/tests/cpp/data/test_array_interface.h +++ b/tests/cpp/data/test_array_interface.h @@ -19,6 +19,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); + column["stream"] = nullptr; d_data.resize(kRows); thrust::sequence(thrust::device, d_data.begin(), d_data.end(), 0.0f, 2.0f); @@ -30,7 +31,7 @@ Json GenerateDenseColumn(std::string const& typestr, size_t kRows, Json(Boolean(false))}; column["data"] = j_data; - column["version"] = Integer(static_cast(1)); + column["version"] = 3; column["typestr"] = String(typestr); return column; } @@ -43,6 +44,7 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows, std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); + column["stream"] = nullptr; d_data.resize(kRows); for (size_t i = 0; i < d_data.size(); ++i) { @@ -56,7 +58,7 @@ Json GenerateSparseColumn(std::string const& typestr, size_t kRows, Json(Boolean(false))}; column["data"] = j_data; - column["version"] = Integer(static_cast(1)); + column["version"] = 3; column["typestr"] = String(typestr); return column; } @@ -75,9 +77,9 @@ Json Generate2dArrayInterface(int rows, int cols, std::string typestr, Json(Integer(reinterpret_cast(data.data().get()))), Json(Boolean(false))}; array_interface["data"] = j_data; - array_interface["version"] = Integer(static_cast(1)); + array_interface["version"] = 3; array_interface["typestr"] = String(typestr); + array_interface["stream"] = nullptr; return array_interface; } - } // namespace xgboost diff --git a/tests/cpp/data/test_iterative_device_dmatrix.cu b/tests/cpp/data/test_iterative_device_dmatrix.cu index cb64a3b5cdb2..27f6b0b3ffe9 100644 --- a/tests/cpp/data/test_iterative_device_dmatrix.cu +++ b/tests/cpp/data/test_iterative_device_dmatrix.cu @@ -103,7 +103,7 @@ TEST(IterativeDeviceDMatrix, RowMajor) { auto j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); - 
ArrayInterface loaded {get(j_interface)}; + ArrayInterface<2> loaded {get(j_interface)}; std::vector h_data(cols * rows); common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); @@ -128,7 +128,7 @@ TEST(IterativeDeviceDMatrix, RowMajorMissing) { std::string interface_str = iter.AsArray(); auto j_interface = Json::Load({interface_str.c_str(), interface_str.size()}); - ArrayInterface loaded {get(j_interface)}; + ArrayInterface<2> loaded {get(j_interface)}; std::vector h_data(cols * rows); common::Span s_data{static_cast(loaded.data), cols * rows}; dh::CopyDeviceSpanToVector(&h_data, s_data); diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index bb5452a56d28..4f379f4fdd25 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -1,4 +1,6 @@ -// Copyright 2016-2020 by Contributors +// Copyright 2016-2021 by Contributors +#include "test_metainfo.h" + #include #include #include @@ -122,7 +124,10 @@ TEST(MetaInfo, SaveLoadBinary) { EXPECT_EQ(inforead.labels_.HostVector(), info.labels_.HostVector()); EXPECT_EQ(inforead.group_ptr_, info.group_ptr_); EXPECT_EQ(inforead.weights_.HostVector(), info.weights_.HostVector()); - EXPECT_EQ(inforead.base_margin_.HostVector(), info.base_margin_.HostVector()); + + auto orig_margin = info.base_margin_.View(xgboost::GenericParameter::kCpuId); + auto read_margin = inforead.base_margin_.View(xgboost::GenericParameter::kCpuId); + EXPECT_TRUE(std::equal(orig_margin.cbegin(), orig_margin.cend(), read_margin.cbegin())); EXPECT_EQ(inforead.feature_type_names.size(), kCols); EXPECT_EQ(inforead.feature_types.Size(), kCols); @@ -254,10 +259,10 @@ TEST(MetaInfo, Validate) { xgboost::HostDeviceVector d_groups{groups}; d_groups.SetDevice(0); d_groups.DevicePointer(); // pull to device - auto arr_interface = xgboost::GetArrayInterface(&d_groups, 64, 1); - std::string arr_interface_str; - xgboost::Json::Dump(arr_interface, &arr_interface_str); - EXPECT_THROW(info.SetInfo("group", arr_interface_str), dmlc::Error); + std::string arr_interface_str{ + xgboost::linalg::MakeVec(d_groups.ConstDevicePointer(), d_groups.Size(), 0) + .ArrayInterfaceStr()}; + EXPECT_THROW(info.SetInfo("group", xgboost::StringView{arr_interface_str}), dmlc::Error); #endif // defined(XGBOOST_USE_CUDA) } @@ -292,3 +297,7 @@ TEST(MetaInfo, HostExtend) { ASSERT_EQ(lhs.group_ptr_.at(i), per_group * i); } } + +namespace xgboost { +TEST(MetaInfo, CPUStridedData) { TestMetaInfoStridedData(GenericParameter::kCpuId); } +} // namespace xgboost diff --git a/tests/cpp/data/test_metainfo.cu b/tests/cpp/data/test_metainfo.cu index 090374b913d6..d57e9ceda807 100644 --- a/tests/cpp/data/test_metainfo.cu +++ b/tests/cpp/data/test_metainfo.cu @@ -1,12 +1,14 @@ -/*! Copyright 2019 by Contributors */ - +/*! 
Copyright 2019-2021 by XGBoost Contributors */ #include #include #include +#include #include #include "test_array_interface.h" #include "../../../src/common/device_helpers.cuh" +#include "test_metainfo.h" + namespace xgboost { template @@ -23,7 +25,7 @@ std::string PrepareData(std::string typestr, thrust::device_vector* out, cons std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); column["strides"] = Array(std::vector{Json(Integer(static_cast(sizeof(T))))}); - column["version"] = Integer(static_cast(1)); + column["version"] = 3; column["typestr"] = String(typestr); auto p_d_data = d_data.data().get(); @@ -31,6 +33,7 @@ std::string PrepareData(std::string typestr, thrust::device_vector* out, cons Json(Integer(reinterpret_cast(p_d_data))), Json(Boolean(false))}; column["data"] = j_data; + column["stream"] = nullptr; Json array(std::vector{column}); std::string str; @@ -49,6 +52,7 @@ TEST(MetaInfo, FromInterface) { info.SetInfo("label", str.c_str()); auto const& h_label = info.labels_.HostVector(); + ASSERT_EQ(h_label.size(), d_data.size()); for (size_t i = 0; i < d_data.size(); ++i) { ASSERT_EQ(h_label[i], d_data[i]); } @@ -60,9 +64,10 @@ TEST(MetaInfo, FromInterface) { } info.SetInfo("base_margin", str.c_str()); - auto const& h_base_margin = info.base_margin_.HostVector(); + auto const h_base_margin = info.base_margin_.View(GenericParameter::kCpuId); + ASSERT_EQ(h_base_margin.Size(), d_data.size()); for (size_t i = 0; i < d_data.size(); ++i) { - ASSERT_EQ(h_base_margin[i], d_data[i]); + ASSERT_EQ(h_base_margin(i), d_data[i]); } thrust::device_vector d_group_data; @@ -76,6 +81,10 @@ TEST(MetaInfo, FromInterface) { EXPECT_EQ(info.group_ptr_, expected_group_ptr); } +TEST(MetaInfo, GPUStridedData) { + TestMetaInfoStridedData(0); +} + TEST(MetaInfo, Group) { cudaSetDevice(0); MetaInfo info; diff --git a/tests/cpp/data/test_metainfo.h b/tests/cpp/data/test_metainfo.h new file mode 100644 index 000000000000..67da633d4be5 --- /dev/null +++ b/tests/cpp/data/test_metainfo.h @@ -0,0 +1,82 @@ +/*! + * Copyright 2021 by XGBoost Contributors + */ +#ifndef XGBOOST_TESTS_CPP_DATA_TEST_METAINFO_H_ +#define XGBOOST_TESTS_CPP_DATA_TEST_METAINFO_H_ +#include +#include +#include +#include + +#include +#include "../../../src/data/array_interface.h" +#include "../../../src/common/linalg_op.h" + +namespace xgboost { +inline void TestMetaInfoStridedData(int32_t device) { + MetaInfo info; + { + // label + HostDeviceVector labels; + labels.Resize(64); + auto& h_labels = labels.HostVector(); + std::iota(h_labels.begin(), h_labels.end(), 0.0f); + bool is_gpu = device >= 0; + if (is_gpu) { + labels.SetDevice(0); + } + + auto t = linalg::TensorView{ + is_gpu ? 
labels.ConstDeviceSpan() : labels.ConstHostSpan(), {32, 2}, device}; + auto s = t.Slice(linalg::All(), 0); + + auto str = s.ArrayInterfaceStr(); + ASSERT_EQ(s.Size(), 32); + + info.SetInfo("label", StringView{str}); + auto const& h_result = info.labels_.HostVector(); + ASSERT_EQ(h_result.size(), 32); + + for (auto v : h_result) { + ASSERT_EQ(static_cast(v) % 2, 0); + } + } + { + // qid + linalg::Tensor qid; + qid.Reshape(32, 2); + auto& h_qid = qid.Data()->HostVector(); + std::iota(h_qid.begin(), h_qid.end(), 0); + auto s = qid.View(device).Slice(linalg::All(), 0); + auto str = s.ArrayInterfaceStr(); + info.SetInfo("qid", StringView{str}); + auto const& h_result = info.group_ptr_; + ASSERT_EQ(h_result.size(), s.Size() + 1); + } + { + // base margin + linalg::Tensor<float, 4> base_margin; + base_margin.Reshape(4, 3, 2, 3); + auto& h_margin = base_margin.Data()->HostVector(); + std::iota(h_margin.begin(), h_margin.end(), 0.0); + auto t_margin = base_margin.View(device).Slice(linalg::All(), linalg::All(), 0, linalg::All()); + ASSERT_EQ(t_margin.Shape().size(), 3); + + info.SetInfo("base_margin", StringView{t_margin.ArrayInterfaceStr()}); + auto const& h_result = info.base_margin_.View(-1); + ASSERT_EQ(h_result.Shape().size(), 3); + auto in_margin = base_margin.View(-1); + linalg::ElementWiseKernelHost(h_result, omp_get_max_threads(), [&](size_t i, float v_0) { + auto tup = linalg::UnravelIndex(i, h_result.Shape()); + auto i0 = std::get<0>(tup); + auto i1 = std::get<1>(tup); + auto i2 = std::get<2>(tup); + // Sliced at the third dimension. + auto v_1 = in_margin(i0, i1, 0, i2); + CHECK_EQ(v_0, v_1); + return v_0; + }); + } +} +} // namespace xgboost +#endif // XGBOOST_TESTS_CPP_DATA_TEST_METAINFO_H_
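Editorial note (not part of the patch): TestMetaInfoStridedData above feeds SetInfo with sliced, non-contiguous views, so array interface consumers must honour the byte strides carried by the interface instead of assuming packed row-major data. A stand-alone sketch of strided element access; GetElementSketch is illustrative, not the ArrayInterface API:

#include <cstddef>
#include <cstdint>
#include <iostream>

// Reads element (i, j) from a 2-d buffer described by a base pointer plus byte
// strides, which is the contract of the array interface "strides" field.
float GetElementSketch(void const* data, std::size_t i, std::size_t j,
                       std::size_t stride0_bytes, std::size_t stride1_bytes) {
  auto base = reinterpret_cast<std::uint8_t const*>(data);
  return *reinterpret_cast<float const*>(base + i * stride0_bytes + j * stride1_bytes);
}

int main() {
  // A column slice of a 4x4 row-major float matrix keeps stride 0 at
  // 4 * sizeof(float) even though only one column is visible.
  float buf[16];
  for (int i = 0; i < 16; ++i) buf[i] = static_cast<float>(i);
  std::cout << GetElementSketch(buf, 2, 0, 4 * sizeof(float), sizeof(float)) << '\n';  // 8
}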
diff --git a/tests/cpp/data/test_simple_dmatrix.cc b/tests/cpp/data/test_simple_dmatrix.cc index f777b00e244c..c25e877079d2 100644 --- a/tests/cpp/data/test_simple_dmatrix.cc +++ b/tests/cpp/data/test_simple_dmatrix.cc @@ -253,8 +253,8 @@ TEST(SimpleDMatrix, Slice) { std::iota(lower.begin(), lower.end(), 0.0f); std::iota(upper.begin(), upper.end(), 1.0f); - auto& margin = p_m->Info().base_margin_.HostVector(); - margin.resize(kRows * kClasses); + auto& margin = p_m->Info().base_margin_; + margin = linalg::Tensor{{kRows, kClasses}, GenericParameter::kCpuId}; std::array ridxs {1, 3, 5}; std::unique_ptr<DMatrix> out { p_m->Slice(ridxs) }; @@ -284,10 +284,10 @@ ASSERT_EQ(p_m->Info().weights_.HostVector().at(ridx), out->Info().weights_.HostVector().at(i)); - auto& out_margin = out->Info().base_margin_.HostVector(); + auto out_margin = out->Info().base_margin_.View(GenericParameter::kCpuId); + auto in_margin = margin.View(GenericParameter::kCpuId); for (size_t j = 0; j < kClasses; ++j) { - auto in_beg = ridx * kClasses; - ASSERT_EQ(out_margin.at(i * kClasses + j), margin.at(in_beg + j)); + ASSERT_EQ(out_margin(i, j), in_margin(ridx, j)); } } } diff --git a/tests/cpp/data/test_simple_dmatrix.cu b/tests/cpp/data/test_simple_dmatrix.cu index d74f5b150087..19f13b1fddfd 100644 --- a/tests/cpp/data/test_simple_dmatrix.cu +++ b/tests/cpp/data/test_simple_dmatrix.cu @@ -122,13 +122,13 @@ TEST(SimpleDMatrix, FromColumnarWithEmptyRows) { col["data"] = j_data; std::vector j_shape{Json(Integer(static_cast(kRows)))}; col["shape"] = Array(j_shape); - col["version"] = Integer(static_cast(1)); + col["version"] = 3; col["typestr"] = String("(1)); + j_mask["version"] = 3; auto& mask_storage = column_bitfields[i]; mask_storage.resize(16); // 16 bytes @@ -220,7 +220,7 @@ TEST(SimpleCSRSource, FromColumnarSparse) { for (size_t c = 0; c < kCols; ++c) { auto& column = j_columns[c]; column = Object(); - column["version"] = Integer(static_cast(1)); + column["version"] = 3; column["typestr"] = String(" j_data { @@ -229,12 +229,12 @@ TEST(SimpleCSRSource, FromColumnarSparse) { column["data"] = j_data; std::vector j_shape {Json(Integer(static_cast(kRows)))}; column["shape"] = Array(j_shape); - column["version"] = Integer(static_cast(1)); + column["version"] = 3; column["typestr"] = String("(1)); + j_mask["version"] = 3; j_mask["data"] = std::vector{ Json(Integer(reinterpret_cast(column_bitfields[c].data().get()))), Json(Boolean(false))}; diff --git a/tests/cpp/helpers.cc b/tests/cpp/helpers.cc index db4454c9eb1e..0906d9ed87df 100644 --- a/tests/cpp/helpers.cc +++ b/tests/cpp/helpers.cc @@ -228,6 +228,7 @@ RandomDataGenerator::GenerateArrayInterfaceBatch( if (device_ >= 0) { array_interface["data"][0] = Integer(reinterpret_cast(storage->DevicePointer() + offset)); + array_interface["stream"] = Null{}; } else { array_interface["data"][0] = Integer(reinterpret_cast(storage->HostPointer() + offset)); @@ -240,7 +241,7 @@ RandomDataGenerator::GenerateArrayInterfaceBatch( array_interface["shape"][1] = cols_; array_interface["typestr"] = String(" *storage, size_t rows, size_t cols) { if (storage->DeviceCanRead()) { array_interface["data"][0] = Integer(reinterpret_cast(storage->ConstDevicePointer())); + array_interface["stream"] = nullptr; } else { array_interface["data"][0] = Integer(reinterpret_cast(storage->ConstHostPointer())); @@ -198,9 +199,9 @@ Json GetArrayInterface(HostDeviceVector *storage, size_t rows, size_t cols) { array_interface["shape"][0] = rows; array_interface["shape"][1] = cols; - char t = ArrayInterfaceHandler::TypeChar<T>(); + char t = linalg::detail::ArrayInterfaceHandler::TypeChar<T>(); array_interface["typestr"] = String(std::string{"<"} + t + std::to_string(sizeof(T))); - array_interface["version"] = 1; + array_interface["version"] = 3; return array_interface; } diff --git a/tests/cpp/predictor/test_gpu_predictor.cu b/tests/cpp/predictor/test_gpu_predictor.cu index ad1083f9161b..b36df742da4f 100644 --- a/tests/cpp/predictor/test_gpu_predictor.cu +++ b/tests/cpp/predictor/test_gpu_predictor.cu @@ -108,7 +108,9 @@ TEST(GPUPredictor, ExternalMemoryTest) { dmats.push_back(CreateSparsePageDMatrix(8000)); for (const auto& dmat: dmats) { - dmat->Info().base_margin_.Resize(dmat->Info().num_row_ * n_classes, 0.5); + dmat->Info().base_margin_ = + linalg::Tensor{{dmat->Info().num_row_, static_cast(n_classes)}, 0}; + dmat->Info().base_margin_.Data()->Fill(0.5); PredictionCacheEntry out_predictions; gpu_predictor->InitOutPredictions(dmat->Info(), &out_predictions.predictions, model); gpu_predictor->PredictBatch(dmat.get(), &out_predictions, model, 0); diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index 1b5ad266d3d9..9dcaa1c0d2c1 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -141,6 +141,7 @@ def test_slice(self): # base margin is per-class in multi-class classifier base_margin = rng.randn(100, 3).astype(np.float32) d.set_base_margin(base_margin) + np.testing.assert_allclose(d.get_base_margin().reshape(100, 3), base_margin) ridxs = [1, 2, 3, 4, 5, 6] sliced = d.slice(ridxs) @@ -154,7 +155,7 @@ def test_slice(self): # Slicing a DMatrix results into a DMatrix that's equivalent to a DMatrix that's # constructed from the corresponding NumPy slice d2 = xgb.DMatrix(X[1:7, :], y[1:7]) - d2.set_base_margin(base_margin[1:7, :].flatten()) + 
d2.set_base_margin(base_margin[1:7, :]) eval_res = {} _ = xgb.train( {'num_class': 3, 'objective': 'multi:softprob', @@ -280,7 +281,7 @@ def test_feature_weights(self): m.set_info(feature_weights=fw) np.testing.assert_allclose(fw, m.get_float_info('feature_weights')) # Handle empty - m.set_info(feature_weights=np.empty((0, 0))) + m.set_info(feature_weights=np.empty((0, ))) assert m.get_float_info('feature_weights').shape[0] == 0 diff --git a/tests/python/test_with_dask.py b/tests/python/test_with_dask.py index 3315d3d8f184..da9441aeed3e 100644 --- a/tests/python/test_with_dask.py +++ b/tests/python/test_with_dask.py @@ -137,16 +137,13 @@ def test_from_dask_array() -> None: prediction = prediction.compute() booster: xgb.Booster = result['booster'] - single_node_predt = booster.predict( - xgb.DMatrix(X.compute()) - ) + single_node_predt = booster.predict(xgb.DMatrix(X.compute())) np.testing.assert_allclose(prediction, single_node_predt) config = json.loads(booster.save_config()) assert int(config['learner']['generic_param']['nthread']) == 5 - from_arr = xgb.dask.predict( - client, model=booster, data=X) + from_arr = xgb.dask.predict(client, model=booster, data=X) assert isinstance(from_arr, da.Array) assert np.all(single_node_predt == from_arr.compute()) @@ -477,23 +474,6 @@ def test_dask_classifier(model: str, client: "Client") -> None: run_dask_classifier(X, y_bin, w, model, None, client, 2) -@pytest.mark.skipif(**tm.no_sklearn()) -def test_sklearn_grid_search(client: "Client") -> None: - from sklearn.model_selection import GridSearchCV - X, y, _ = generate_array() - reg = xgb.dask.DaskXGBRegressor(learning_rate=0.1, - tree_method='hist') - reg.client = client - model = GridSearchCV(reg, {'max_depth': [2, 4], - 'n_estimators': [5, 10]}, - cv=2, verbose=1) - model.fit(X, y) - # Expect unique results for each parameter value This confirms - # sklearn is able to successfully update the parameter - means = model.cv_results_['mean_test_score'] - assert len(means) == len(set(means)) - - def test_empty_dmatrix_training_continuation(client: "Client") -> None: kRows, kCols = 1, 97 X = dd.from_array(np.random.randn(kRows, kCols)) @@ -714,18 +694,11 @@ def test_auc(client: "Client") -> None: # No test for Exact, as empty DMatrix handling are mostly for distributed # environment and Exact doesn't support it. 
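Editorial note (not part of the patch): the surrounding distributed tests rely on some workers holding zero-row inputs, and the linalg types introduced by this patch keep that case well defined (see TEST(Linalg, Empty) earlier): a tensor's size is the product of its shape, so any zero dimension yields an empty but still valid container whose other dimensions remain queryable. A stand-alone sketch of that semantic:

#include <cassert>
#include <cstddef>
#include <numeric>
#include <vector>

std::size_t TensorSizeSketch(std::vector<std::size_t> const& shape) {
  // Product over all dimensions; a single zero dimension makes the tensor empty.
  return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                         [](std::size_t a, std::size_t b) { return a * b; });
}

int main() {
  assert(TensorSizeSketch({0, 3}) == 0);  // zero rows: empty, like a zero-row DMatrix
  assert(TensorSizeSketch({2, 3, 4}) == 24);
  return 0;
}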
-def test_empty_dmatrix_hist() -> None: - with LocalCluster(n_workers=kWorkers) as cluster: - with Client(cluster) as client: - parameters = {'tree_method': 'hist'} - run_empty_dmatrix_reg(client, parameters) - run_empty_dmatrix_cls(client, parameters) - - -def test_empty_dmatrix_approx() -> None: +@pytest.mark.parametrize("tree_method", ["hist", "approx"]) +def test_empty_dmatrix(tree_method) -> None: with LocalCluster(n_workers=kWorkers) as cluster: with Client(cluster) as client: - parameters = {'tree_method': 'approx'} + parameters = {'tree_method': tree_method} run_empty_dmatrix_reg(client, parameters) run_empty_dmatrix_cls(client, parameters) @@ -1102,12 +1075,12 @@ def after_iteration( os.remove(after_fname) def run_updater_test( - self, - client: "Client", - params: Dict, - num_rounds: int, - dataset: tm.TestDataset, - tree_method: str + self, + client: "Client", + params: Dict, + num_rounds: int, + dataset: tm.TestDataset, + tree_method: str ) -> None: params['tree_method'] = tree_method params = dataset.set_params(params) diff --git a/tests/python/test_with_sklearn.py b/tests/python/test_with_sklearn.py index 9d63b160c244..4ab86b7e20ba 100644 --- a/tests/python/test_with_sklearn.py +++ b/tests/python/test_with_sklearn.py @@ -17,16 +17,6 @@ from sklearn.utils.estimator_checks import parametrize_with_checks -class TemporaryDirectory(object): - """Context manager for tempfile.mkdtemp()""" - def __enter__(self): - self.name = tempfile.mkdtemp() - return self.name - - def __exit__(self, exc_type, exc_value, traceback): - shutil.rmtree(self.name) - - def test_binary_classification(): from sklearn.datasets import load_digits from sklearn.model_selection import KFold @@ -509,7 +499,7 @@ def wrapped(y, p): assert is_called[0] -def test_sklearn_api(): +def run_sklearn_api(booster, error, n_est): from sklearn.datasets import load_iris from sklearn.model_selection import train_test_split @@ -517,30 +507,18 @@ def test_sklearn_api(): tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, train_size=120, test_size=0.2) - classifier = xgb.XGBClassifier(booster='gbtree', n_estimators=10) + classifier = xgb.XGBClassifier(booster=booster, n_estimators=n_est) classifier.fit(tr_d, tr_l) preds = classifier.predict(te_d) labels = te_l err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l) - assert err < 0.2 - - -def test_sklearn_api_gblinear(): - from sklearn.datasets import load_iris - from sklearn.model_selection import train_test_split + assert err < error - iris = load_iris() - tr_d, te_d, tr_l, te_l = train_test_split(iris.data, iris.target, - train_size=120) - classifier = xgb.XGBClassifier(booster='gblinear', n_estimators=100) - classifier.fit(tr_d, tr_l) - - preds = classifier.predict(te_d) - labels = te_l - err = sum([1 for p, l in zip(preds, labels) if p != l]) * 1.0 / len(te_l) - assert err < 0.5 +def test_sklearn_api(): + run_sklearn_api("gbtree", 0.2, 10) + run_sklearn_api("gblinear", 0.5, 100) @pytest.mark.skipif(**tm.no_matplotlib()) @@ -721,7 +699,7 @@ def test_sklearn_get_default_params(): assert cls.get_params()['base_score'] is not None -def test_validation_weights_xgbmodel(): +def run_validation_weights(model): from sklearn.datasets import make_hastie_10_2 # prepare training and test data @@ -733,7 +711,7 @@ def test_validation_weights_xgbmodel(): # instantiate model param_dist = {'objective': 'binary:logistic', 'n_estimators': 2, 'random_state': 123} - clf = xgb.sklearn.XGBModel(**param_dist) + clf = model(**param_dist) # train it using instance 
weights only in the training set weights_train = np.random.choice([1, 2], len(X_train)) @@ -778,49 +756,9 @@ def test_validation_weights_xgbmodel(): sample_weight_eval_set=[weights_train]) -def test_validation_weights_xgbclassifier(): - from sklearn.datasets import make_hastie_10_2 - - # prepare training and test data - X, y = make_hastie_10_2(n_samples=2000, random_state=42) - labels, y = np.unique(y, return_inverse=True) - X_train, X_test = X[:1600], X[1600:] - y_train, y_test = y[:1600], y[1600:] - - # instantiate model - param_dist = {'objective': 'binary:logistic', 'n_estimators': 2, - 'random_state': 123} - clf = xgb.sklearn.XGBClassifier(**param_dist) - - # train it using instance weights only in the training set - weights_train = np.random.choice([1, 2], len(X_train)) - clf.fit(X_train, y_train, - sample_weight=weights_train, - eval_set=[(X_test, y_test)], - eval_metric='logloss', - verbose=False) - - # evaluate logloss metric on test set *without* using weights - evals_result_without_weights = clf.evals_result() - logloss_without_weights = evals_result_without_weights[ - "validation_0"]["logloss"] - - # now use weights for the test set - np.random.seed(0) - weights_test = np.random.choice([1, 2], len(X_test)) - clf.fit(X_train, y_train, - sample_weight=weights_train, - eval_set=[(X_test, y_test)], - sample_weight_eval_set=[weights_test], - eval_metric='logloss', - verbose=False) - evals_result_with_weights = clf.evals_result() - logloss_with_weights = evals_result_with_weights["validation_0"]["logloss"] - - # check that the logloss in the test set is actually different - # when using weights than when not using them - assert all((logloss_with_weights[i] != logloss_without_weights[i] - for i in [0, 1])) +def test_validation_weights(): + run_validation_weights(xgb.XGBModel) + run_validation_weights(xgb.XGBClassifier) def save_load_model(model_path): @@ -862,16 +800,16 @@ def save_load_model(model_path): def test_save_load_model(): - with TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: model_path = os.path.join(tempdir, 'digits.model') save_load_model(model_path) - with TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: model_path = os.path.join(tempdir, 'digits.model.json') save_load_model(model_path) from sklearn.datasets import load_digits - with TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: model_path = os.path.join(tempdir, 'digits.model.json') digits = load_digits(n_class=2) y = digits['target'] @@ -949,7 +887,7 @@ def test_XGBClassifier_resume(): from sklearn.datasets import load_breast_cancer from sklearn.metrics import log_loss - with TemporaryDirectory() as tempdir: + with tempfile.TemporaryDirectory() as tempdir: model1_path = os.path.join(tempdir, 'test_XGBClassifier.model') model1_booster_path = os.path.join(tempdir, 'test_XGBClassifier.booster') @@ -1089,7 +1027,7 @@ def test_pandas_input(): def run_feature_weights(X, y, fw, model=xgb.XGBRegressor): - with TemporaryDirectory() as tmpdir: + with tempfile.TemporaryDirectory() as tmpdir: colsample_bynode = 0.5 reg = model(tree_method='hist', colsample_bynode=colsample_bynode)
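Editorial note (not part of the patch): several hunks above bump the array interface "version" from 1 to 3 and add the "stream" field that version 3 of the __cuda_array_interface__ protocol requires. A minimal sketch of the JSON document the updated test helpers produce for a dense float32 matrix; MakeArrayInterfaceSketch is illustrative, not an XGBoost API:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

std::string MakeArrayInterfaceSketch(float const* data, std::size_t rows, std::size_t cols) {
  std::ostringstream ss;
  // "<f4" means little-endian 4-byte float; "data" is [pointer, read-only flag].
  ss << "{\"data\": [" << reinterpret_cast<std::uintptr_t>(data) << ", false], "
     << "\"shape\": [" << rows << ", " << cols << "], "
     << "\"typestr\": \"<f4\", \"version\": 3, \"stream\": null}";
  return ss.str();
}

int main() {
  std::vector<float> x(6, 1.0f);
  std::cout << MakeArrayInterfaceSketch(x.data(), 2, 3) << "\n";
}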