add some kernels(csr*dense->csr, dense*dense->csr) of SparseTensor matmul (PaddlePaddle#42935)

* add some kernels (csr*dense->csr, dense*dense->csr) of SparseTensor matmul

* fix CI

* fix CI

* fix comment

* fix comment
zhwesky2010 authored and sneaxiy committed Jun 27, 2022
1 parent 3533b1b commit 07d1017
Showing 27 changed files with 1,656 additions and 65 deletions.
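In terms of the cuSPARSE-style primitives declared later in this commit (SparseBlas::DSDMM and SparseBlas::SDDMM in paddle/phi/kernels/funcs/sparse/sparse_blas.h), the two new matmul kernels amount to the following products. These formulas assume the usual SpMM/SDDMM semantics and are a reading of the declared signatures, not a specification taken from the commit itself:

    C_dense = alpha * op(A_csr) * op(B_dense) + beta * C_dense                      (DSDMM: sparse @ dense -> dense)
    C_csr   = alpha * (op(A_dense) * op(B_dense)) masked by spy(C_csr) + beta * C_csr   (SDDMM: dense @ dense -> sparse)

Here op(.) is the optional transpose selected by transa/transb, and spy(C_csr) restricts the dense product to the existing sparsity pattern of the CSR output.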
5 changes: 5 additions & 0 deletions paddle/fluid/eager/grad_node_info.cc
@@ -27,6 +27,7 @@
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

/**
* Implementation of GradNodeBase, Edge and GradTensorHolder.
@@ -114,6 +115,10 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
phi::SparseCooTensor* coo_tensor =
static_cast<phi::SparseCooTensor*>(fwd_out.impl().get());
dense_tensor = coo_tensor->mutable_non_zero_elements();
} else if (phi::SparseCsrTensor::classof(fwd_out.impl().get())) {
phi::SparseCsrTensor* csr_tensor =
static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
dense_tensor = csr_tensor->mutable_non_zero_elements();
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
13 changes: 11 additions & 2 deletions paddle/fluid/eager/grad_tensor_holder.cc
@@ -66,8 +66,17 @@ void GradTensorHolder::CopyValueFromTensor(
// Create new tensor->impl and fill it with 1.0
if (t.defined()) {
// Fill with 1.0; use full to support complex dtypes, which ones_like does not support.
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
if (t.is_dense_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
} else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::sparse::full_like(t, 1, t.dtype());
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
"now."));
}
egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
->SetStopGradient(false);
}
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h
@@ -31,7 +31,7 @@ class CusparseHandleHolder {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(handle_, stream));
#endif
@@ -41,7 +41,7 @@

~CusparseHandleHolder() PADDLE_MAY_THROW {
#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle_));
#endif
#endif
4 changes: 0 additions & 4 deletions paddle/fluid/platform/dynload/cusparse.cc
@@ -24,10 +24,6 @@ namespace dynload {
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
39 changes: 17 additions & 22 deletions paddle/fluid/platform/dynload/cusparse.h
@@ -29,23 +29,17 @@ namespace dynload {
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
@@ -59,11 +53,13 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
@@ -72,8 +68,7 @@ CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
4 changes: 0 additions & 4 deletions paddle/phi/backends/dynload/cusparse.cc
@@ -26,10 +26,6 @@ void *cusparse_dso_handle;
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
43 changes: 19 additions & 24 deletions paddle/phi/backends/dynload/cusparse.h
@@ -30,34 +30,28 @@ extern void *cusparse_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
using Func = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
return reinterpret_cast<Func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
@@ -71,11 +65,13 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
@@ -84,8 +80,7 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
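For readers unfamiliar with the dynload machinery, here is a hand-written sketch of what DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP expands to for a single routine. It is illustrative only, relies on the cusparse_dso_flag, cusparse_dso_handle and GetCusparseDsoHandle declarations already present in this header, and is not part of the diff:

// Approximate expansion of DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(cusparseCreate).
struct DynLoad__cusparseCreate {
  template <typename... Args>
  cusparseStatus_t operator()(Args... args) {
    using Func = decltype(&::cusparseCreate);
    // Open libcusparse exactly once per process, then cache the resolved symbol.
    std::call_once(cusparse_dso_flag, []() {
      cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle();
    });
    static void *p_cusparseCreate = dlsym(cusparse_dso_handle, "cusparseCreate");
    return reinterpret_cast<Func>(p_cusparseCreate)(args...);
  }
};
extern DynLoad__cusparseCreate cusparseCreate;

Callers simply write phi::dynload::cusparseCreate(&handle); the library is loaded lazily on the first call to any wrapped routine.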
13 changes: 11 additions & 2 deletions paddle/phi/backends/gpu/gpu_context.cc
@@ -402,7 +402,10 @@ struct GPUContext::Impl {

void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }

sparseHandle_t GetSparseHandle() const {
sparseHandle_t GetSparseHandle() {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) phi::InitSparseHandle(&sparse_handle_, stream_);
});
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
@@ -519,7 +522,12 @@ struct GPUContext::Impl {
}

inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
const std::function<void(sparseHandle_t)>& callback) {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) {
phi::InitSparseHandle(&sparse_handle_, stream_);
}
});
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
@@ -598,6 +606,7 @@ struct GPUContext::Impl {
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};

std::once_flag flag_sparse_;
std::once_flag flag_blas_;
std::once_flag flag_blaslt_;
std::once_flag flag_dnn_;
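A minimal, self-contained sketch of the lazy-initialization pattern adopted above for the cuSPARSE handle; the type and helper names below (Handle, CreateHandle) are placeholders rather than Paddle APIs:

#include <mutex>

struct Handle {};                               // stands in for sparseHandle_t
Handle* CreateHandle() { return new Handle; }   // placeholder for phi::InitSparseHandle

class LazyHandle {
 public:
  // The handle is created on first use only, so contexts that never touch
  // cuSPARSE pay no setup cost; std::call_once keeps creation thread-safe.
  Handle* Get() {
    std::call_once(flag_, [this]() {
      if (handle_ == nullptr) handle_ = CreateHandle();
    });
    return handle_;
  }

 private:
  Handle* handle_{nullptr};
  std::once_flag flag_;
};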
4 changes: 2 additions & 2 deletions paddle/phi/backends/gpu/gpu_resources.cc
@@ -250,7 +250,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream));
#endif
@@ -259,7 +259,7 @@

void DestroySparseHandle(sparseHandle_t handle) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
if (handle != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle));
handle = nullptr;
4 changes: 4 additions & 0 deletions paddle/phi/core/sparse_csr_tensor.h
@@ -85,6 +85,10 @@ class SparseCsrTensor : public TensorBase,
/// \return The non-zero elements in the original dense tensor.
const DenseTensor& non_zero_elements() const { return non_zero_elements_; }

/// \brief Returns the total number of non zero elements in original dense
/// tensor.
int64_t nnz() const { return non_zero_elements_.numel(); }

/// \brief Return the number of elements contained in original dense tensor
/// \return The number of elements contained in original dense tensor
int64_t numel() const override { return product(dims_); }
96 changes: 96 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -0,0 +1,96 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

namespace phi {
namespace funcs {
namespace sparse {

template <typename DeviceContext>
class SparseBlas {
public:
explicit SparseBlas(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}

// TODO(zhouwei25): implement "COO @ DENSE -> DENSE" of DSDMM
template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCooTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCsrTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::SparseCsrTensor* mat_c) const;

private:
const DeviceContext& dev_ctx_;
};

template <typename DeviceContext, typename T>
class SparseBlasT : private SparseBlas<DeviceContext> {
public:
using SparseBlas<DeviceContext>::SparseBlas;

template <typename... ARGS>
void DSDMM(ARGS... args) const {
Base()->template DSDMM<T>(args...);
}

template <typename... ARGS>
void SDDMM(ARGS... args) const {
Base()->template SDDMM<T>(args...);
}

private:
const SparseBlas<DeviceContext>* Base() const {
return static_cast<const SparseBlas<DeviceContext>*>(this);
}
};

template <typename DeviceContext, typename T>
inline SparseBlasT<DeviceContext, T> GetSparseBlas(
const DeviceContext& dev_ctx) {
return SparseBlasT<DeviceContext, T>(dev_ctx);
}

} // namespace sparse
} // namespace funcs
} // namespace phi

#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h"
#endif
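A hedged usage sketch of the helper declared above, as a GPU kernel might invoke it. The context and tensor variables (dev_ctx, csr_a, dense_a, dense_b, dense_c, csr_c) are placeholders, and the call pattern is inferred from the declared signatures rather than taken from this commit:

// C_dense = 1 * A_csr * B_dense + 0 * C_dense  (sparse @ dense -> dense)
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<phi::GPUContext, float>(dev_ctx);
sparse_blas.DSDMM(/*transa=*/false, /*transb=*/false,
                  1.0f, csr_a, dense_b,
                  0.0f, &dense_c);

// C_csr = 1 * (A_dense * B_dense) sampled on C_csr's sparsity pattern + 0 * C_csr
sparse_blas.SDDMM(/*transa=*/false, /*transb=*/false,
                  1.0f, dense_a, dense_b,
                  0.0f, &csr_c);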
