From 03f68306724207463f79b8d2f44922f4fdb95f62 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 18:20:04 +0800 Subject: [PATCH 01/13] Prototype. --- include/xgboost/linalg.h | 47 ++++++++++++++++++++++++++++++--- tests/cpp/common/test_linalg.cc | 17 ++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index bb0486866f81..cfb78ca7e925 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -55,8 +55,16 @@ constexpr void CalcStride(size_t (&shape)[D], size_t (&stride)[D]) { } struct AllTag {}; + struct IntTag {}; +template +struct RangeTag { + I beg; + I end; + constexpr size_t Size() const { return end - beg; } +}; + /** * \brief Calculate the dimension of sliced tensor. */ @@ -83,7 +91,7 @@ template using RemoveCRType = std::remove_const_t>; template -using IndexToTag = std::conditional_t>::value, IntTag, AllTag>; +using IndexToTag = std::conditional_t>::value, IntTag, S>; template XGBOOST_DEVICE constexpr auto UnrollLoop(Fn fn) { @@ -192,9 +200,16 @@ XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t) { } // namespace detail /** - * \brief Specify all elements in the axis is used for slice. + * \brief Specify all elements in the axis for slicing. */ constexpr detail::AllTag All() { return {}; } +/** + * \brief Specify a range of elements in the axis for slicing. + */ +template +constexpr detail::RangeTag Range(I beg, I end) { + return {beg, end}; +} /** * \brief A tensor view with static type and shape. It implements indexing and slicing. @@ -233,7 +248,33 @@ class TensorView { } } - template + template + XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + detail::RangeTag &&range) const { + static_assert(new_dim < D, ""); + static_assert(old_dim < kDim, ""); + new_stride[new_dim] = stride_[old_dim]; + assert(range.Size() <= shape_[old_dim]); + new_shape[new_dim] = range.Size(); + return 0; + } + /** + * \brief Slice dimension for Range tag. + */ + template + XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + detail::RangeTag &&range, S &&...slices) const { + static_assert(new_dim < D, ""); + static_assert(old_dim < kDim, ""); + new_stride[new_dim] = stride_[old_dim]; + assert(range.Size() <= shape_[old_dim]); + new_shape[new_dim] = range.Size(); + return MakeSliceDim(new_shape, new_stride, + std::forward(slices)...) 
+ + range.beg; + } + + template XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const { static_assert(new_dim < D, ""); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index 44a91c3f2c2e..aa982f6128e8 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -107,6 +107,23 @@ TEST(Linalg, TensorView) { auto s = t.Slice(1, 2, All()); static_assert(decltype(s)::kDimension == 1, ""); } + { + TensorView t{data, {2, 3, 4}, 0}; + auto s = t.Slice(1, linalg::All(), 1); + ASSERT_EQ(s(0), 13); + ASSERT_EQ(s(1), 17); + ASSERT_EQ(s(2), 21); + } + { + TensorView t{data, {2, 3, 4}, 0}; + auto s = t.Slice(linalg::All(), linalg::Range(0, 3), 2); + static_assert(decltype(s)::kDimension == 2, ""); + } + { + TensorView t{data, {2, 3, 4}, 0}; + auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4)); + static_assert(decltype(s)::kDimension == 3, ""); + } } TEST(Linalg, Tensor) { From 7a116bd79d23b72b723eb7e19e3b403c1cfc580e Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 19:36:33 +0800 Subject: [PATCH 02/13] remove iterators. --- include/xgboost/linalg.h | 16 ++++++++-------- tests/cpp/common/test_linalg.cc | 5 +++-- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index cfb78ca7e925..55d29ea69010 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -254,9 +254,11 @@ class TensorView { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; - assert(range.Size() <= shape_[old_dim]); new_shape[new_dim] = range.Size(); - return 0; + assert(static_cast(range.end) <= shape_[old_dim]); + + auto offset = stride_[old_dim] * range.beg; + return offset; } /** * \brief Slice dimension for Range tag. @@ -267,11 +269,13 @@ class TensorView { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; - assert(range.Size() <= shape_[old_dim]); new_shape[new_dim] = range.Size(); + assert(static_cast(range.end) <= shape_[old_dim]); + + auto offset = stride_[old_dim] * range.beg; return MakeSliceDim(new_shape, new_stride, std::forward(slices)...) + - range.beg; + offset; } template @@ -439,10 +443,6 @@ class TensorView { */ XGBOOST_DEVICE auto Stride(size_t i) const { return stride_[i]; } - XGBOOST_DEVICE auto cbegin() const { return data_.cbegin(); } // NOLINT - XGBOOST_DEVICE auto cend() const { return data_.cend(); } // NOLINT - XGBOOST_DEVICE auto begin() { return data_.begin(); } // NOLINT - XGBOOST_DEVICE auto end() { return data_.end(); } // NOLINT /** * \brief Number of items in the tensor. 
*/ diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index aa982f6128e8..79654157f4a9 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -136,7 +136,8 @@ TEST(Linalg, Tensor) { size_t n = 2 * 3 * 4; ASSERT_EQ(t.Size(), n); - ASSERT_TRUE(std::equal(k_view.cbegin(), k_view.cbegin(), view.begin())); + ASSERT_TRUE( + std::equal(k_view.Values().cbegin(), k_view.Values().cbegin(), view.Values().begin())); Tensor t_0{std::move(t)}; ASSERT_EQ(t_0.Size(), n); @@ -190,7 +191,7 @@ TEST(Linalg, ArrayInterface) { auto cpu = kCpuId; auto t = Tensor{{3, 3}, cpu}; auto v = t.View(cpu); - std::iota(v.begin(), v.end(), 0); + std::iota(v.Values().begin(), v.Values().end(), 0); auto arr = Json::Load(StringView{v.ArrayInterfaceStr()}); ASSERT_EQ(get(arr["shape"][0]), 3); ASSERT_EQ(get(arr["strides"][0]), 3 * sizeof(double)); From 7913d942e69adcbd4a236aeb0f556b62a331c107 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 20:24:16 +0800 Subject: [PATCH 03/13] Helper functions. --- include/xgboost/linalg.h | 23 +++++++++++++++++++---- src/metric/auc.cc | 5 ++--- tests/cpp/common/test_linalg.cc | 29 ++++++++++++++++++++++++----- tests/cpp/common/test_linalg.cu | 20 ++++++++++++++++++++ 4 files changed, 65 insertions(+), 12 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 55d29ea69010..9d408e3b9713 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -212,7 +212,7 @@ constexpr detail::RangeTag Range(I beg, I end) { } /** - * \brief A tensor view with static type and shape. It implements indexing and slicing. + * \brief A tensor view with static type and dimension. It implements indexing and slicing. * * Most of the algorithms in XGBoost are implemented for both CPU and GPU without using * much linear algebra routines, this class is a helper intended to ease some high level @@ -224,7 +224,7 @@ constexpr detail::RangeTag Range(I beg, I end) { * some functions expect data types that can be used in everywhere (update prediction * cache for example). */ -template +template class TensorView { public: using ShapeT = size_t[kDim]; @@ -389,7 +389,7 @@ class TensorView { */ template XGBOOST_DEVICE T &operator()(Index &&...index) { - static_assert(sizeof...(index) <= kDim, "Invalid index."); + static_assert(sizeof...(index) == kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; @@ -399,7 +399,7 @@ class TensorView { */ template XGBOOST_DEVICE T const &operator()(Index &&...index) const { - static_assert(sizeof...(index) <= kDim, "Invalid index."); + static_assert(sizeof...(index) == kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; @@ -516,6 +516,21 @@ class TensorView { } }; +/** + * \brief Constructor for automatic type deduction. + */ +template ::value> * = nullptr> +auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device = 0) { // NOLINT + using T = typename Container::value_type; + return TensorView{data, shape, device}; +} + +template +auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device = 0) { + return TensorView{data, shape, device}; +} + /** * \brief Turns linear index into multi-dimension index. Similar to numpy unravel. 
*/ diff --git a/src/metric/auc.cc b/src/metric/auc.cc index ec8b6ee01bdd..5097116fbb78 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -85,9 +85,8 @@ double MultiClassOVR(common::Span predts, MetaInfo const &info, auto const &labels = info.labels_.ConstHostVector(); std::vector results_storage(n_classes * 3, 0); - linalg::TensorView results(results_storage, - {n_classes, static_cast(3)}, - GenericParameter::kCpuId); + linalg::TensorView results(results_storage, {n_classes, static_cast(3)}, + GenericParameter::kCpuId); auto local_area = results.Slice(linalg::All(), 0); auto tp = results.Slice(linalg::All(), 1); auto auc = results.Slice(linalg::All(), 2); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index 79654157f4a9..47d6e4622547 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -51,7 +51,7 @@ TEST(Linalg, TensorView) { std::vector data(2 * 3 * 4, 0); std::iota(data.begin(), data.end(), 0); - TensorView t{data, {2, 3, 4}, -1}; + auto t = MakeTensorView(data, {2, 3, 4}, -1); ASSERT_EQ(t.Shape()[0], 2); ASSERT_EQ(t.Shape()[1], 3); ASSERT_EQ(t.Shape()[2], 4); @@ -96,33 +96,52 @@ TEST(Linalg, TensorView) { // assignment TensorView t{data, {2, 3, 4}, 0}; double pi = 3.14159; + auto old = t(1, 2, 3); t(1, 2, 3) = pi; ASSERT_EQ(t(1, 2, 3), pi); + t(1, 2, 3) = old; + ASSERT_EQ(t(1, 2, 3), old); } { // Don't assign the initial dimension, tensor should be able to deduce the correct dim // for Slice. - TensorView t{data, {2, 3, 4}, 0}; + auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(1, 2, All()); static_assert(decltype(s)::kDimension == 1, ""); } { - TensorView t{data, {2, 3, 4}, 0}; + auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(1, linalg::All(), 1); ASSERT_EQ(s(0), 13); ASSERT_EQ(s(1), 17); ASSERT_EQ(s(2), 21); } { - TensorView t{data, {2, 3, 4}, 0}; + auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(linalg::All(), linalg::Range(0, 3), 2); static_assert(decltype(s)::kDimension == 2, ""); + std::vector sol{2, 6, 10, 14, 18, 22}; + auto k = 0; + for (size_t i = 0; i < s.Shape(0); ++i) { + for (size_t j = 0; j < s.Shape(1); ++j) { + ASSERT_EQ(s(i, j), sol.at(k)); + k++; + } + } } { - TensorView t{data, {2, 3, 4}, 0}; + auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4)); static_assert(decltype(s)::kDimension == 3, ""); + auto all = t.Slice(linalg::All(), linalg::All(), linalg::All()); + for (size_t i = 0; i < s.Shape(0); ++i) { + for (size_t j = 0; j < s.Shape(1); ++j) { + for (size_t k = 0; k < s.Shape(2); ++k) { + ASSERT_EQ(s(i, j, k), all(i,j, k)); + } + } + } } } diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index abfef8bfd353..9f5101c5b827 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -56,7 +56,27 @@ void TestElementWiseKernel() { } } } + +void TestSlice() { + thrust::device_vector data(2 * 3 * 4); + auto t = MakeTensorView(dh::ToSpan(data), {2, 3, 4}, 0); + dh::LaunchN(1, [=] __device__(size_t) { + auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4)); + auto all = t.Slice(linalg::All(), linalg::All(), linalg::All()); + static_assert(decltype(s)::kDimension == 3, ""); + for (size_t i = 0; i < s.Shape(0); ++i) { + for (size_t j = 0; j < s.Shape(1); ++j) { + for (size_t k = 0; k < s.Shape(2); ++k) { + SPAN_CHECK(s(i, j, k) == all(i, j, k)); + } + } + } + }); +} } // anonymous namespace + TEST(Linalg, GPUElementWise) 
{ TestElementWiseKernel(); } + +TEST(Linalg, GPUTensorView) { TestSlice(); } } // namespace linalg } // namespace xgboost From a0ef665bcac5c686f2f87317f1bd587a6eb4f3fb Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 21:02:29 +0800 Subject: [PATCH 04/13] Decouple. --- include/xgboost/linalg.h | 92 ++++++++++++++++++++------------- src/common/linalg_op.cuh | 2 +- src/common/linalg_op.h | 2 +- tests/cpp/common/test_linalg.cc | 25 +++++++-- tests/cpp/common/test_linalg.cu | 4 +- 5 files changed, 80 insertions(+), 45 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 9d408e3b9713..f68e6ad6dbbc 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -20,6 +20,15 @@ #include #include +// decouple it from xgboost. +#ifndef LINALG_HD +#if defined(__CUDA__) || defined(__NVCC__) +#define LINALG_HD __host__ __device__ +#else +#define LINALG_HD +#endif // defined (__CUDA__) || defined(__NVCC__) +#endif // LINALG_HD + namespace xgboost { namespace linalg { namespace detail { @@ -47,7 +56,7 @@ constexpr std::enable_if_t Offset(S (&strides)[D], } template -constexpr void CalcStride(size_t (&shape)[D], size_t (&stride)[D]) { +constexpr void CalcStride(size_t const (&shape)[D], size_t (&stride)[D]) { stride[D - 1] = 1; for (int32_t s = D - 2; s >= 0; --s) { stride[s] = shape[s + 1] * stride[s + 1]; @@ -94,7 +103,7 @@ template using IndexToTag = std::conditional_t>::value, IntTag, S>; template -XGBOOST_DEVICE constexpr auto UnrollLoop(Fn fn) { +LINALG_HD constexpr auto UnrollLoop(Fn fn) { #if defined __CUDA_ARCH__ #pragma unroll n #endif // defined __CUDA_ARCH__ @@ -110,7 +119,7 @@ int32_t NativePopc(T v) { return c; } -inline XGBOOST_DEVICE int Popc(uint32_t v) { +inline LINALG_HD int Popc(uint32_t v) { #if defined(__CUDA_ARCH__) return __popc(v); #elif defined(__GNUC__) || defined(__clang__) @@ -122,7 +131,7 @@ inline XGBOOST_DEVICE int Popc(uint32_t v) { #endif // compiler } -inline XGBOOST_DEVICE int Popc(uint64_t v) { +inline LINALG_HD int Popc(uint64_t v) { #if defined(__CUDA_ARCH__) return __popcll(v); #elif defined(__GNUC__) || defined(__clang__) @@ -148,7 +157,7 @@ constexpr auto Arr2Tup(T (&arr)[N]) { // slow on both CPU and GPU, especially 64 bit integer. So here we first try to avoid 64 // bit when the index is smaller, then try to avoid division when it's exp of 2. template -XGBOOST_DEVICE auto UnravelImpl(I idx, common::Span shape) { +LINALG_HD auto UnravelImpl(I idx, common::Span shape) { size_t index[D]{0}; static_assert(std::is_signed::value, "Don't change the type without changing the for loop."); @@ -182,7 +191,7 @@ void ReshapeImpl(size_t (&out_shape)[D], I &&s, S &&...rest) { } template -XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t, std::index_sequence) { +LINALG_HD decltype(auto) constexpr Apply(Fn &&f, Tup &&t, std::index_sequence) { return f(std::get(t)...); } @@ -193,7 +202,7 @@ XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t, std::index_sequen * \param t tuple of arguments */ template -XGBOOST_DEVICE decltype(auto) constexpr Apply(Fn &&f, Tup &&t) { +LINALG_HD decltype(auto) constexpr Apply(Fn &&f, Tup &&t) { constexpr auto kSize = std::tuple_size::value; return Apply(std::forward(f), std::forward(t), std::make_index_sequence{}); } @@ -240,7 +249,7 @@ class TensorView { int32_t device_{-1}; // Unlike `Tensor`, the data_ can have arbitrary size since this is just a view. 
- XGBOOST_DEVICE void CalcSize() { + LINALG_HD void CalcSize() { if (data_.empty()) { size_ = 0; } else { @@ -249,8 +258,8 @@ class TensorView { } template - XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], - detail::RangeTag &&range) const { + LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + detail::RangeTag &&range) const { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; @@ -264,8 +273,8 @@ class TensorView { * \brief Slice dimension for Range tag. */ template - XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], - detail::RangeTag &&range, S &&...slices) const { + LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], + detail::RangeTag &&range, S &&...slices) const { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; @@ -279,8 +288,7 @@ class TensorView { } template - XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], - detail::AllTag) const { + LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag) const { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; @@ -291,8 +299,8 @@ class TensorView { * \brief Slice dimension for All tag. */ template - XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag, - S &&...slices) const { + LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], detail::AllTag, + S &&...slices) const { static_assert(new_dim < D, ""); static_assert(old_dim < kDim, ""); new_stride[new_dim] = stride_[old_dim]; @@ -302,7 +310,7 @@ class TensorView { } template - XGBOOST_DEVICE size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], Index i) const { + LINALG_HD size_t MakeSliceDim(size_t new_shape[D], size_t new_stride[D], Index i) const { static_assert(old_dim < kDim, ""); return stride_[old_dim] * i; } @@ -310,7 +318,7 @@ class TensorView { * \brief Slice dimension for Index tag. */ template - XGBOOST_DEVICE std::enable_if_t::value, size_t> MakeSliceDim( + LINALG_HD std::enable_if_t::value, size_t> MakeSliceDim( size_t new_shape[D], size_t new_stride[D], Index i, S &&...slices) const { static_assert(old_dim < kDim, ""); auto offset = stride_[old_dim] * i; @@ -336,7 +344,7 @@ class TensorView { * \param device Device ordinal */ template - XGBOOST_DEVICE TensorView(common::Span data, I const (&shape)[D], int32_t device) + LINALG_HD TensorView(common::Span data, I const (&shape)[D], int32_t device) : data_{data}, ptr_{data_.data()}, device_{device} { static_assert(D > 0 && D <= kDim, "Invalid shape."); // shape @@ -355,8 +363,8 @@ class TensorView { * stride can be calculated from shape. 
*/ template - XGBOOST_DEVICE TensorView(common::Span data, I const (&shape)[D], I const (&stride)[D], - int32_t device) + LINALG_HD TensorView(common::Span data, I const (&shape)[D], I const (&stride)[D], + int32_t device) : data_{data}, ptr_{data_.data()}, device_{device} { static_assert(D == kDim, "Invalid shape & stride."); detail::UnrollLoop([&](auto i) { @@ -366,7 +374,7 @@ class TensorView { this->CalcSize(); } - XGBOOST_DEVICE TensorView(TensorView const &that) + LINALG_HD TensorView(TensorView const &that) : data_{that.data_}, ptr_{data_.data()}, size_{that.size_}, device_{that.device_} { detail::UnrollLoop([&](auto i) { stride_[i] = that.stride_[i]; @@ -388,7 +396,7 @@ class TensorView { * \endcode */ template - XGBOOST_DEVICE T &operator()(Index &&...index) { + LINALG_HD T &operator()(Index &&...index) { static_assert(sizeof...(index) == kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); @@ -398,7 +406,7 @@ class TensorView { * \brief Index the tensor to obtain a scalar value. */ template - XGBOOST_DEVICE T const &operator()(Index &&...index) const { + LINALG_HD T const &operator()(Index &&...index) const { static_assert(sizeof...(index) == kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); @@ -419,7 +427,7 @@ class TensorView { * \endcode */ template - XGBOOST_DEVICE auto Slice(S &&...slices) const { + LINALG_HD auto Slice(S &&...slices) const { static_assert(sizeof...(slices) <= kDim, "Invalid slice."); int32_t constexpr kNewDim{detail::CalcSliceDim...>()}; size_t new_shape[kNewDim]; @@ -432,33 +440,39 @@ class TensorView { return ret; } - XGBOOST_DEVICE auto Shape() const { return common::Span{shape_}; } + LINALG_HD auto Shape() const { return common::Span{shape_}; } /** * Get the shape for i^th dimension */ - XGBOOST_DEVICE auto Shape(size_t i) const { return shape_[i]; } - XGBOOST_DEVICE auto Stride() const { return common::Span{stride_}; } + LINALG_HD auto Shape(size_t i) const { return shape_[i]; } + LINALG_HD auto Stride() const { return common::Span{stride_}; } /** * Get the stride for i^th dimension, stride is specified as number of items instead of bytes. */ - XGBOOST_DEVICE auto Stride(size_t i) const { return stride_[i]; } + LINALG_HD auto Stride(size_t i) const { return stride_[i]; } /** * \brief Number of items in the tensor. */ - XGBOOST_DEVICE size_t Size() const { return size_; } + LINALG_HD size_t Size() const { return size_; } /** - * \brief Whether it's a contiguous array. (c and f contiguous are both contiguous) + * \brief Whether it's a c-contiguous array. */ - XGBOOST_DEVICE bool Contiguous() const { return size_ == data_.size(); } + LINALG_HD bool CContiguous() const { + StrideT stride; + static_assert(std::is_same::value, ""); + // It's contiguous if the stride can be calculated from shape. + detail::CalcStride(shape_, stride); + return common::Span{stride_} == common::Span{stride}; + } /** - * \brief Obtain the raw data. + * \brief Obtain a reference to the raw data. */ - XGBOOST_DEVICE auto Values() const { return data_; } + LINALG_HD auto Values() const -> decltype(data_) const & { return data_; } /** * \brief Obtain the CUDA device ordinal. 
*/ - XGBOOST_DEVICE auto DeviceIdx() const { return device_; } + LINALG_HD auto DeviceIdx() const { return device_; } /** * \brief Array Interface defined by @@ -527,7 +541,7 @@ auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device = 0) { } template -auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device = 0) { +LINALG_HD auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device = 0) { return TensorView{data, shape, device}; } @@ -535,7 +549,7 @@ auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device = * \brief Turns linear index into multi-dimension index. Similar to numpy unravel. */ template -XGBOOST_DEVICE auto UnravelIndex(size_t idx, common::Span shape) { +LINALG_HD auto UnravelIndex(size_t idx, common::Span shape) { if (idx > std::numeric_limits::max()) { return detail::UnravelImpl(static_cast(idx), shape); } else { @@ -730,4 +744,8 @@ void Stack(Tensor *l, Tensor const &r) { } } // namespace linalg } // namespace xgboost + +#if defined(LINALG_HD) +#undef LINALG_HD +#endif // defined(LINALG_HD) #endif // XGBOOST_LINALG_H_ diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index dfab58729b56..f0a18a912a22 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -10,7 +10,7 @@ namespace xgboost { namespace linalg { template void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { - if (t.Contiguous()) { + if (t.CContiguous()) { auto ptr = t.Values().data(); dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); }); } else { diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index a74b119e7947..987e07a59d77 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -10,7 +10,7 @@ namespace xgboost { namespace linalg { template void ElementWiseKernelHost(linalg::TensorView t, int32_t n_threads, Fn&& fn) { - if (t.Contiguous()) { + if (t.CContiguous()) { auto ptr = t.Values().data(); common::ParallelFor(t.Size(), n_threads, [&](size_t i) { ptr[i] = fn(i, ptr[i]); }); } else { diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index 47d6e4622547..75a33a415e01 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -119,9 +119,9 @@ TEST(Linalg, TensorView) { } { auto t = MakeTensorView(data, {2, 3, 4}, 0); - auto s = t.Slice(linalg::All(), linalg::Range(0, 3), 2); + auto s = t.Slice(linalg::All(), linalg::Range(1, 3), 2); static_assert(decltype(s)::kDimension == 2, ""); - std::vector sol{2, 6, 10, 14, 18, 22}; + std::vector sol{6, 10, 18, 22}; auto k = 0; for (size_t i = 0; i < s.Shape(0); ++i) { for (size_t j = 0; j < s.Shape(1); ++j) { @@ -129,6 +129,21 @@ TEST(Linalg, TensorView) { k++; } } + ASSERT_FALSE(s.CContiguous()); + } + { + auto t = MakeTensorView(data, {2, 3, 4}, 0); + auto s = t.Slice(1, linalg::Range(1, 3), linalg::Range(1, 3)); + static_assert(decltype(s)::kDimension == 2, ""); + std::vector sol{17, 18, 21, 22}; + auto k = 0; + for (size_t i = 0; i < s.Shape(0); ++i) { + for (size_t j = 0; j < s.Shape(1); ++j) { + ASSERT_EQ(s(i, j), sol.at(k)); + k++; + } + } + ASSERT_FALSE(s.CContiguous()); } { auto t = MakeTensorView(data, {2, 3, 4}, 0); @@ -138,10 +153,12 @@ TEST(Linalg, TensorView) { for (size_t i = 0; i < s.Shape(0); ++i) { for (size_t j = 0; j < s.Shape(1); ++j) { for (size_t k = 0; k < s.Shape(2); ++k) { - ASSERT_EQ(s(i, j, k), all(i,j, k)); + ASSERT_EQ(s(i, j, k), all(i, j, k)); } } } + ASSERT_TRUE(s.CContiguous()); + 
ASSERT_TRUE(all.CContiguous()); } } @@ -156,7 +173,7 @@ TEST(Linalg, Tensor) { size_t n = 2 * 3 * 4; ASSERT_EQ(t.Size(), n); ASSERT_TRUE( - std::equal(k_view.Values().cbegin(), k_view.Values().cbegin(), view.Values().begin())); + std::equal(k_view.Values().cbegin(), k_view.Values().cend(), view.Values().cbegin())); Tensor t_0{std::move(t)}; ASSERT_EQ(t_0.Size(), n); diff --git a/tests/cpp/common/test_linalg.cu b/tests/cpp/common/test_linalg.cu index 9f5101c5b827..9ea6b22dd012 100644 --- a/tests/cpp/common/test_linalg.cu +++ b/tests/cpp/common/test_linalg.cu @@ -18,7 +18,7 @@ void TestElementWiseKernel() { */ // GPU view auto t = l.View(0).Slice(linalg::All(), 1, linalg::All()); - ASSERT_FALSE(t.Contiguous()); + ASSERT_FALSE(t.CContiguous()); ElementWiseKernelDevice(t, [] __device__(size_t i, float) { return i; }); // CPU view t = l.View(GenericParameter::kCpuId).Slice(linalg::All(), 1, linalg::All()); @@ -42,7 +42,7 @@ void TestElementWiseKernel() { */ auto t = l.View(0); ElementWiseKernelDevice(t, [] __device__(size_t i, float) { return i; }); - ASSERT_TRUE(t.Contiguous()); + ASSERT_TRUE(t.CContiguous()); // CPU view t = l.View(GenericParameter::kCpuId); From d2e3170fd49f4bc491e4152ae528f30fdf801fb4 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 21:26:05 +0800 Subject: [PATCH 05/13] Move array interface out. --- include/xgboost/linalg.h | 131 ++++++++++++++----------- src/data/file_iterator.h | 6 +- tests/cpp/common/test_linalg.cc | 6 +- tests/cpp/data/test_adapter.cc | 6 +- tests/cpp/data/test_array_interface.cc | 5 +- 5 files changed, 85 insertions(+), 69 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index f68e6ad6dbbc..b6e688f83a84 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -374,11 +374,14 @@ class TensorView { this->CalcSize(); } - LINALG_HD TensorView(TensorView const &that) - : data_{that.data_}, ptr_{data_.data()}, size_{that.size_}, device_{that.device_} { + template < + typename U, + std::enable_if_t::value> * = nullptr> + LINALG_HD TensorView(TensorView const &that) // NOLINT + : data_{that.Values()}, ptr_{data_.data()}, size_{that.Size()}, device_{that.DeviceIdx()} { detail::UnrollLoop([&](auto i) { - stride_[i] = that.stride_[i]; - shape_[i] = that.shape_[i]; + stride_[i] = that.Stride(i); + shape_[i] = that.Shape(i); }); } @@ -473,61 +476,6 @@ class TensorView { * \brief Obtain the CUDA device ordinal. */ LINALG_HD auto DeviceIdx() const { return device_; } - - /** - * \brief Array Interface defined by - * numpy. - * - * `stream` is optionally included when data is on CUDA device. - */ - Json ArrayInterface() const { - Json array_interface{Object{}}; - array_interface["data"] = std::vector(2); - array_interface["data"][0] = Integer(reinterpret_cast(data_.data())); - array_interface["data"][1] = Boolean{true}; - if (this->DeviceIdx() >= 0) { - // Change this once we have different CUDA stream. 
- array_interface["stream"] = Null{}; - } - std::vector shape(Shape().size()); - std::vector stride(Stride().size()); - for (size_t i = 0; i < Shape().size(); ++i) { - shape[i] = Integer(Shape(i)); - stride[i] = Integer(Stride(i) * sizeof(T)); - } - array_interface["shape"] = Array{shape}; - array_interface["strides"] = Array{stride}; - array_interface["version"] = 3; - - char constexpr kT = detail::ArrayInterfaceHandler::TypeChar(); - static_assert(kT != '\0', ""); - if (DMLC_LITTLE_ENDIAN) { - array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))}; - } else { - array_interface["typestr"] = String{">" + (kT + std::to_string(sizeof(T)))}; - } - return array_interface; - } - /** - * \brief Same as const version, but returns non-readonly data pointer. - */ - Json ArrayInterface() { - auto const &as_const = *this; - auto res = as_const.ArrayInterface(); - res["data"][1] = Boolean{false}; - return res; - } - - auto ArrayInterfaceStr() const { - std::string str; - Json::Dump(this->ArrayInterface(), &str); - return str; - } - auto ArrayInterfaceStr() { - std::string str; - Json::Dump(this->ArrayInterface(), &str); - return str; - } }; /** @@ -586,6 +534,71 @@ auto MakeVec(T *ptr, size_t s, int32_t device = -1) { template using MatrixView = TensorView; + + /** + * \brief Array Interface defined by + * numpy. + * + * `stream` is optionally included when data is on CUDA device. + */ +template +Json ArrayInterface(TensorView const& t) { + Json array_interface{Object{}}; + array_interface["data"] = std::vector(2); + array_interface["data"][0] = Integer(reinterpret_cast(t.Values().data())); + array_interface["data"][1] = Boolean{true}; + if (t.DeviceIdx() >= 0) { + // Change this once we have different CUDA stream. + array_interface["stream"] = Null{}; + } + std::vector shape(t.Shape().size()); + std::vector stride(t.Stride().size()); + for (size_t i = 0; i < t.Shape().size(); ++i) { + shape[i] = Integer(t.Shape(i)); + stride[i] = Integer(t.Stride(i) * sizeof(T)); + } + array_interface["shape"] = Array{shape}; + array_interface["strides"] = Array{stride}; + array_interface["version"] = 3; + + char constexpr kT = detail::ArrayInterfaceHandler::TypeChar(); + static_assert(kT != '\0', ""); + if (DMLC_LITTLE_ENDIAN) { + array_interface["typestr"] = String{"<" + (kT + std::to_string(sizeof(T)))}; + } else { + array_interface["typestr"] = String{">" + (kT + std::to_string(sizeof(T)))}; + } + return array_interface; +} + +/** + * \brief Same as const version, but returns non-readonly data pointer. + */ +template +Json ArrayInterface(TensorView const &t) { + TensorView const &as_const = t; + auto res = ArrayInterface(as_const); + res["data"][1] = Boolean{false}; + return res; +} + +/** + * \brief Return string representation of array interface. + */ +template +auto ArrayInterfaceStr(TensorView const &t) { + std::string str; + Json::Dump(ArrayInterface(t), &str); + return str; +} + +template +auto ArrayInterfaceStr(TensorView const &t) { + std::string str; + Json::Dump(ArrayInterface(t), &str); + return str; +} + /** * \brief A tensor storage. To use it for other functionality like slicing one needs to * obtain a view first. This way we can use it on both host and device. 
diff --git a/src/data/file_iterator.h b/src/data/file_iterator.h index 70a5d51c30b9..96f0e09d485b 100644 --- a/src/data/file_iterator.h +++ b/src/data/file_iterator.h @@ -61,9 +61,9 @@ class FileIterator { row_block_ = parser_->Value(); using linalg::MakeVec; - indptr_ = MakeVec(row_block_.offset, row_block_.size + 1).ArrayInterfaceStr(); - values_ = MakeVec(row_block_.value, row_block_.offset[row_block_.size]).ArrayInterfaceStr(); - indices_ = MakeVec(row_block_.index, row_block_.offset[row_block_.size]).ArrayInterfaceStr(); + indptr_ = ArrayInterfaceStr(MakeVec(row_block_.offset, row_block_.size + 1)); + values_ = ArrayInterfaceStr(MakeVec(row_block_.value, row_block_.offset[row_block_.size])); + indices_ = ArrayInterfaceStr(MakeVec(row_block_.index, row_block_.offset[row_block_.size])); size_t n_columns = *std::max_element(row_block_.index, row_block_.index + row_block_.offset[row_block_.size]); diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index 75a33a415e01..f13defeb8bd4 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -228,12 +228,16 @@ TEST(Linalg, ArrayInterface) { auto t = Tensor{{3, 3}, cpu}; auto v = t.View(cpu); std::iota(v.Values().begin(), v.Values().end(), 0); - auto arr = Json::Load(StringView{v.ArrayInterfaceStr()}); + auto arr = Json::Load(StringView{ArrayInterfaceStr(v)}); ASSERT_EQ(get(arr["shape"][0]), 3); ASSERT_EQ(get(arr["strides"][0]), 3 * sizeof(double)); ASSERT_FALSE(get(arr["data"][1])); ASSERT_EQ(reinterpret_cast(get(arr["data"][0])), v.Values().data()); + + TensorView as_const = v; + auto const_arr = ArrayInterface(as_const); + ASSERT_TRUE(get(const_arr["data"][1])); } TEST(Linalg, Popc) { diff --git a/tests/cpp/data/test_adapter.cc b/tests/cpp/data/test_adapter.cc index c6de226194f6..fa3ed61f6808 100644 --- a/tests/cpp/data/test_adapter.cc +++ b/tests/cpp/data/test_adapter.cc @@ -42,9 +42,9 @@ TEST(Adapter, CSRArrayAdapter) { size_t n_features = 100, n_samples = 10; RandomDataGenerator{n_samples, n_features, 0.5}.GenerateCSR(&values, &indptr, &indices); using linalg::MakeVec; - auto indptr_arr = MakeVec(indptr.HostPointer(), indptr.Size()).ArrayInterfaceStr(); - auto values_arr = MakeVec(values.HostPointer(), values.Size()).ArrayInterfaceStr(); - auto indices_arr = MakeVec(indices.HostPointer(), indices.Size()).ArrayInterfaceStr(); + auto indptr_arr = ArrayInterfaceStr(MakeVec(indptr.HostPointer(), indptr.Size())); + auto values_arr = ArrayInterfaceStr(MakeVec(values.HostPointer(), values.Size())); + auto indices_arr = ArrayInterfaceStr(MakeVec(indices.HostPointer(), indices.Size())); auto adapter = data::CSRArrayAdapter( StringView{indptr_arr.c_str(), indptr_arr.size()}, StringView{values_arr.c_str(), values_arr.size()}, diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index 8efd30eb75b2..3c2e0e38d5c3 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -19,9 +19,8 @@ TEST(ArrayInterface, Initialize) { ASSERT_EQ(arr_interface.type, ArrayInterfaceHandler::kF4); HostDeviceVector u64_storage(storage.Size()); - std::string u64_arr_str{linalg::TensorView{ - u64_storage.ConstHostSpan(), {kRows, kCols}, GenericParameter::kCpuId} - .ArrayInterfaceStr()}; + std::string u64_arr_str{ArrayInterfaceStr(linalg::TensorView{ + u64_storage.ConstHostSpan(), {kRows, kCols}, GenericParameter::kCpuId})}; std::copy(storage.ConstHostVector().cbegin(), storage.ConstHostVector().cend(), u64_storage.HostSpan().begin()); auto 
u64_arr = ArrayInterface<2>{u64_arr_str}; From c5d3bbe60450f9702e8ec84dbcf1f9843ad97ef4 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 21:29:19 +0800 Subject: [PATCH 06/13] format. --- include/xgboost/linalg.h | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index b6e688f83a84..8329320380ea 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -534,15 +534,14 @@ auto MakeVec(T *ptr, size_t s, int32_t device = -1) { template using MatrixView = TensorView; - - /** - * \brief Array Interface defined by - * numpy. - * - * `stream` is optionally included when data is on CUDA device. - */ +/** + * \brief Array Interface defined by + * numpy. + * + * `stream` is optionally included when data is on CUDA device. + */ template -Json ArrayInterface(TensorView const& t) { +Json ArrayInterface(TensorView const &t) { Json array_interface{Object{}}; array_interface["data"] = std::vector(2); array_interface["data"][0] = Integer(reinterpret_cast(t.Values().data())); From eeeb4412466ee67ee8d3b0ef841fbafcac9ad53d Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 21:38:26 +0800 Subject: [PATCH 07/13] Compile with master. --- include/xgboost/linalg.h | 4 ++-- src/data/data.cc | 6 +++--- tests/cpp/data/test_metainfo.cc | 8 ++++---- tests/cpp/data/test_metainfo.h | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 8329320380ea..b6b2584988c1 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -400,7 +400,7 @@ class TensorView { */ template LINALG_HD T &operator()(Index &&...index) { - static_assert(sizeof...(index) == kDim, "Invalid index."); + static_assert(sizeof...(index) <= kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; @@ -410,7 +410,7 @@ class TensorView { */ template LINALG_HD T const &operator()(Index &&...index) const { - static_assert(sizeof...(index) == kDim, "Invalid index."); + static_assert(sizeof...(index) <= kDim, "Invalid index."); size_t offset = detail::Offset<0ul>(stride_, 0ul, std::forward(index)...); assert(offset < data_.size() && "Out of bound access."); return ptr_[offset]; diff --git a/src/data/data.cc b/src/data/data.cc index 3a2215180dce..205311c8f6d2 100644 --- a/src/data/data.cc +++ b/src/data/data.cc @@ -413,7 +413,7 @@ void CopyTensorInfoImpl(Json arr_interface, linalg::Tensor* p_out) { } p_out->Reshape(array.shape); auto t = p_out->View(GenericParameter::kCpuId); - CHECK(t.Contiguous()); + CHECK(t.CContiguous()); // FIXME(jiamingy): Remove the use of this default thread. 
linalg::ElementWiseKernelHost(t, common::OmpGetNumThreads(0), [&](auto i, auto) { return linalg::detail::Apply(TypedIndex{array}, linalg::UnravelIndex(i, t.Shape())); @@ -531,8 +531,8 @@ void MetaInfo::SetInfo(const char* key, const void* dptr, DataType dtype, size_t using T = std::remove_pointer_t; auto t = linalg::TensorView(common::Span{cast_d_ptr, num}, {num}, GenericParameter::kCpuId); - CHECK(t.Contiguous()); - Json interface { t.ArrayInterface() }; + CHECK(t.CContiguous()); + Json interface { linalg::ArrayInterface(t) }; assert(ArrayInterface<1>{interface}.is_contiguous); return interface; }; diff --git a/tests/cpp/data/test_metainfo.cc b/tests/cpp/data/test_metainfo.cc index 4f379f4fdd25..4a1ef7a7290f 100644 --- a/tests/cpp/data/test_metainfo.cc +++ b/tests/cpp/data/test_metainfo.cc @@ -127,7 +127,8 @@ TEST(MetaInfo, SaveLoadBinary) { auto orig_margin = info.base_margin_.View(xgboost::GenericParameter::kCpuId); auto read_margin = inforead.base_margin_.View(xgboost::GenericParameter::kCpuId); - EXPECT_TRUE(std::equal(orig_margin.cbegin(), orig_margin.cend(), read_margin.cbegin())); + EXPECT_TRUE(std::equal(orig_margin.Values().cbegin(), orig_margin.Values().cend(), + read_margin.Values().cbegin())); EXPECT_EQ(inforead.feature_type_names.size(), kCols); EXPECT_EQ(inforead.feature_types.Size(), kCols); @@ -259,9 +260,8 @@ TEST(MetaInfo, Validate) { xgboost::HostDeviceVector d_groups{groups}; d_groups.SetDevice(0); d_groups.DevicePointer(); // pull to device - std::string arr_interface_str{ - xgboost::linalg::MakeVec(d_groups.ConstDevicePointer(), d_groups.Size(), 0) - .ArrayInterfaceStr()}; + std::string arr_interface_str{ArrayInterfaceStr( + xgboost::linalg::MakeVec(d_groups.ConstDevicePointer(), d_groups.Size(), 0))}; EXPECT_THROW(info.SetInfo("group", xgboost::StringView{arr_interface_str}), dmlc::Error); #endif // defined(XGBOOST_USE_CUDA) } diff --git a/tests/cpp/data/test_metainfo.h b/tests/cpp/data/test_metainfo.h index 67da633d4be5..2b30e646593c 100644 --- a/tests/cpp/data/test_metainfo.h +++ b/tests/cpp/data/test_metainfo.h @@ -30,7 +30,7 @@ inline void TestMetaInfoStridedData(int32_t device) { is_gpu ? labels.ConstDeviceSpan() : labels.ConstHostSpan(), {32, 2}, device}; auto s = t.Slice(linalg::All(), 0); - auto str = s.ArrayInterfaceStr(); + auto str = ArrayInterfaceStr(s); ASSERT_EQ(s.Size(), 32); info.SetInfo("label", StringView{str}); @@ -48,7 +48,7 @@ inline void TestMetaInfoStridedData(int32_t device) { auto& h_qid = qid.Data()->HostVector(); std::iota(h_qid.begin(), h_qid.end(), 0); auto s = qid.View(device).Slice(linalg::All(), 0); - auto str = s.ArrayInterfaceStr(); + auto str = ArrayInterfaceStr(s); info.SetInfo("qid", StringView{str}); auto const& h_result = info.group_ptr_; ASSERT_EQ(h_result.size(), s.Size() + 1); @@ -62,7 +62,7 @@ inline void TestMetaInfoStridedData(int32_t device) { auto t_margin = base_margin.View(device).Slice(linalg::All(), linalg::All(), 0, linalg::All()); ASSERT_EQ(t_margin.Shape().size(), 3); - info.SetInfo("base_margin", StringView{t_margin.ArrayInterfaceStr()}); + info.SetInfo("base_margin", StringView{ArrayInterfaceStr(t_margin)}); auto const& h_result = info.base_margin_.View(-1); ASSERT_EQ(h_result.Shape().size(), 3); auto in_margin = base_margin.View(-1); From f94b6fec661e4bf5710879ff68111be00d2a422e Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 21:51:35 +0800 Subject: [PATCH 08/13] Don't the default device. 
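
Requiring the ordinal at every call site presumably avoids a silent mistake: with int32_t device = 0 as the default, a caller that forgets the last argument gets a host buffer tagged as CUDA device 0. A minimal sketch of a call site after the change, mirroring tests/cpp/common/test_linalg.cc (-1, i.e. kCpuId, selects the host):

    std::vector<double> data(2 * 3 * 4, 0.0);
    // The device ordinal can no longer be omitted; use -1 (kCpuId) for host data.
    auto t = xgboost::linalg::MakeTensorView(data, {2, 3, 4}, -1);
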
--- include/xgboost/linalg.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index b6b2584988c1..a3b8193603f9 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -483,13 +483,13 @@ class TensorView { */ template ::value> * = nullptr> -auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device = 0) { // NOLINT +auto MakeTensorView(Container &data, I const (&shape)[D], int32_t device) { // NOLINT using T = typename Container::value_type; return TensorView{data, shape, device}; } template -LINALG_HD auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device = 0) { +LINALG_HD auto MakeTensorView(common::Span data, I const (&shape)[D], int32_t device) { return TensorView{data, shape, device}; } From 9a520b1ea10b7510663a2525e80ace3947c0d566 Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 22:06:59 +0800 Subject: [PATCH 09/13] Test. --- tests/cpp/common/test_linalg.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index f13defeb8bd4..4425cbf41e38 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -160,6 +160,18 @@ TEST(Linalg, TensorView) { ASSERT_TRUE(s.CContiguous()); ASSERT_TRUE(all.CContiguous()); } + + { + auto t = MakeTensorView(data, {2, 3, 4}, 0); + auto copied = t; + auto moved = std::move(t); + for (size_t i = 0; i < t.Shape().size(); ++i) { + ASSERT_EQ(t.Shape(i), copied.Shape(i)); + ASSERT_EQ(t.Shape(i), moved.Shape(i)); + ASSERT_EQ(t.Stride(i), copied.Stride(i)); + ASSERT_EQ(t.Stride(i), moved.Stride(i)); + } + } } TEST(Linalg, Tensor) { From 88b2c52967b574d3917bb987a70f1706ca60a89e Mon Sep 17 00:00:00 2001 From: fis Date: Fri, 19 Nov 2021 22:11:45 +0800 Subject: [PATCH 10/13] don't test moved object. --- tests/cpp/common/test_linalg.cc | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index 4425cbf41e38..e6d1e2414129 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -118,6 +118,7 @@ TEST(Linalg, TensorView) { ASSERT_EQ(s(2), 21); } { + // range slice auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(linalg::All(), linalg::Range(1, 3), 2); static_assert(decltype(s)::kDimension == 2, ""); @@ -132,6 +133,7 @@ TEST(Linalg, TensorView) { ASSERT_FALSE(s.CContiguous()); } { + // range slice auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(1, linalg::Range(1, 3), linalg::Range(1, 3)); static_assert(decltype(s)::kDimension == 2, ""); @@ -146,6 +148,7 @@ TEST(Linalg, TensorView) { ASSERT_FALSE(s.CContiguous()); } { + // same as no slice. auto t = MakeTensorView(data, {2, 3, 4}, 0); auto s = t.Slice(linalg::All(), linalg::Range(0, 3), linalg::Range(0, 4)); static_assert(decltype(s)::kDimension == 3, ""); @@ -162,14 +165,13 @@ TEST(Linalg, TensorView) { } { + // copy and move constructor. 
auto t = MakeTensorView(data, {2, 3, 4}, 0); - auto copied = t; - auto moved = std::move(t); + auto from_copy = t; + auto from_move = std::move(t); for (size_t i = 0; i < t.Shape().size(); ++i) { - ASSERT_EQ(t.Shape(i), copied.Shape(i)); - ASSERT_EQ(t.Shape(i), moved.Shape(i)); - ASSERT_EQ(t.Stride(i), copied.Stride(i)); - ASSERT_EQ(t.Stride(i), moved.Stride(i)); + ASSERT_EQ(from_copy.Shape(i), from_move.Shape(i)); + ASSERT_EQ(from_copy.Stride(i), from_copy.Stride(i)); } } } From aef06970a3bceea5102820727b2c5a495b3208e4 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 23 Nov 2021 05:54:52 +0800 Subject: [PATCH 11/13] Restore contiguous. --- include/xgboost/linalg.h | 4 ++++ src/common/linalg_op.cuh | 2 +- src/common/linalg_op.h | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index a3b8193603f9..fd9597a6667e 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -458,6 +458,10 @@ class TensorView { * \brief Number of items in the tensor. */ LINALG_HD size_t Size() const { return size_; } + /** + * \brief Whether this is a contiguous array, both C and F contiguous returns true. + */ + LINALG_HD bool Contiguous() const { return this->Size() == data_.size(); } /** * \brief Whether it's a c-contiguous array. */ diff --git a/src/common/linalg_op.cuh b/src/common/linalg_op.cuh index f0a18a912a22..dfab58729b56 100644 --- a/src/common/linalg_op.cuh +++ b/src/common/linalg_op.cuh @@ -10,7 +10,7 @@ namespace xgboost { namespace linalg { template void ElementWiseKernelDevice(linalg::TensorView t, Fn&& fn, cudaStream_t s = nullptr) { - if (t.CContiguous()) { + if (t.Contiguous()) { auto ptr = t.Values().data(); dh::LaunchN(t.Size(), s, [=] __device__(size_t i) { ptr[i] = fn(i, ptr[i]); }); } else { diff --git a/src/common/linalg_op.h b/src/common/linalg_op.h index 987e07a59d77..a74b119e7947 100644 --- a/src/common/linalg_op.h +++ b/src/common/linalg_op.h @@ -10,7 +10,7 @@ namespace xgboost { namespace linalg { template void ElementWiseKernelHost(linalg::TensorView t, int32_t n_threads, Fn&& fn) { - if (t.CContiguous()) { + if (t.Contiguous()) { auto ptr = t.Values().data(); common::ParallelFor(t.Size(), n_threads, [&](size_t i) { ptr[i] = fn(i, ptr[i]); }); } else { From 2c6a57fa22faaa892f46be66956094fba3ba1cf0 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 23 Nov 2021 06:24:16 +0800 Subject: [PATCH 12/13] Extend to f-contiguous. --- include/xgboost/linalg.h | 29 ++++++++++++++++++++++++----- tests/cpp/common/test_linalg.cc | 22 +++++++++++++++++++++- 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index fd9597a6667e..200610367616 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -55,11 +55,18 @@ constexpr std::enable_if_t Offset(S (&strides)[D], return Offset(strides, n + (head * strides[dim]), std::forward(rest)...); } -template +template constexpr void CalcStride(size_t const (&shape)[D], size_t (&stride)[D]) { - stride[D - 1] = 1; - for (int32_t s = D - 2; s >= 0; --s) { - stride[s] = shape[s + 1] * stride[s + 1]; + if (f_array) { + stride[0] = 1; + for (int32_t s = 1; s < D; ++s) { + stride[s] = shape[s - 1] * stride[s - 1]; + } + } else { + stride[D - 1] = 1; + for (int32_t s = D - 2; s >= 0; --s) { + stride[s] = shape[s + 1] * stride[s + 1]; + } } } @@ -461,7 +468,9 @@ class TensorView { /** * \brief Whether this is a contiguous array, both C and F contiguous returns true. 
*/ - LINALG_HD bool Contiguous() const { return this->Size() == data_.size(); } + LINALG_HD bool Contiguous() const { + return data_.size() == this->Size() || this->CContiguous() || this->FContiguous(); + } /** * \brief Whether it's a c-contiguous array. */ @@ -472,6 +481,16 @@ class TensorView { detail::CalcStride(shape_, stride); return common::Span{stride_} == common::Span{stride}; } + /** + * \brief Whether it's a f-contiguous array. + */ + LINALG_HD bool FContiguous() const { + StrideT stride; + static_assert(std::is_same::value, ""); + // It's contiguous if the stride can be calculated from shape. + detail::CalcStride(shape_, stride); + return common::Span{stride_} == common::Span{stride}; + } /** * \brief Obtain a reference to the raw data. */ diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index e6d1e2414129..bc4945219763 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -166,7 +166,7 @@ TEST(Linalg, TensorView) { { // copy and move constructor. - auto t = MakeTensorView(data, {2, 3, 4}, 0); + auto t = MakeTensorView(data, {2, 3, 4}, kCpuId); auto from_copy = t; auto from_move = std::move(t); for (size_t i = 0; i < t.Shape().size(); ++i) { @@ -174,6 +174,26 @@ TEST(Linalg, TensorView) { ASSERT_EQ(from_copy.Stride(i), from_copy.Stride(i)); } } + + { + // multiple slices + auto t = MakeTensorView(data, {2, 3, 4}, kCpuId); + auto s_0 = t.Slice(linalg::All(), linalg::Range(0, 2), linalg::Range(1, 4)); + ASSERT_FALSE(s_0.CContiguous()); + auto s_1 = s_0.Slice(1, 1, linalg::Range(0, 2)); + ASSERT_EQ(s_1.Size(), 2); + ASSERT_TRUE(s_1.CContiguous()); + ASSERT_TRUE(s_1.Contiguous()); + ASSERT_EQ(s_1(0), 17); + ASSERT_EQ(s_1(1), 18); + } + { + // f-contiguous + TensorView t{data, {4, 3, 2}, {1, 4, 12}, kCpuId}; + ASSERT_TRUE(t.Contiguous()); + ASSERT_TRUE(t.FContiguous()); + ASSERT_FALSE(t.CContiguous()); + } } TEST(Linalg, Tensor) { From 6e66d7dcb20e6b7c8ae420733494de6c4ae238c0 Mon Sep 17 00:00:00 2001 From: fis Date: Tue, 23 Nov 2021 06:47:54 +0800 Subject: [PATCH 13/13] One more test. --- tests/cpp/common/test_linalg.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/cpp/common/test_linalg.cc b/tests/cpp/common/test_linalg.cc index bc4945219763..a4f3e6ab41fb 100644 --- a/tests/cpp/common/test_linalg.cc +++ b/tests/cpp/common/test_linalg.cc @@ -186,6 +186,16 @@ TEST(Linalg, TensorView) { ASSERT_TRUE(s_1.Contiguous()); ASSERT_EQ(s_1(0), 17); ASSERT_EQ(s_1(1), 18); + + auto s_2 = s_0.Slice(1, linalg::All(), linalg::Range(0, 2)); + std::vector sol{13, 14, 17, 18}; + auto k = 0; + for (size_t i = 0; i < s_2.Shape(0); i++) { + for (size_t j = 0; j < s_2.Shape(1); ++j) { + ASSERT_EQ(s_2(i, j), sol[k]); + k++; + } + } } { // f-contiguous
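
The contiguity queries restored and extended in patches 11 and 12 can be exercised directly against the layouts used in the tests above. A minimal sketch (assuming TensorView's element-type/dimension template arguments are <double, 3> here and that -1 denotes the CPU ordinal, both taken from the tests in this series):

    #include <xgboost/linalg.h>

    #include <numeric>
    #include <vector>

    int main() {
      std::vector<double> data(2 * 3 * 4);
      std::iota(data.begin(), data.end(), 0.0);

      // C (row-major) layout: strides are derived from the shape.
      auto c = xgboost::linalg::MakeTensorView(data, {2, 3, 4}, -1);
      // c.CContiguous() == true, c.FContiguous() == false, c.Contiguous() == true

      // F (column-major) layout: explicit strides {1, 4, 12} for shape {4, 3, 2}.
      xgboost::linalg::TensorView<double, 3> f{data, {4, 3, 2}, {1, 4, 12}, -1};
      // f.FContiguous() == true, f.CContiguous() == false, f.Contiguous() == true

      // A non-trivial slice is generally neither C- nor F-contiguous.
      auto s = c.Slice(xgboost::linalg::All(), xgboost::linalg::Range(1, 3), 2);
      // s.CContiguous() == false
      return 0;
    }
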