PaddlePaddle · zhwesky2010 · Jun 28, 2022 · Jun 24, 2022 · Jun 27, 2022
diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h
@@ -29,7 +29,6 @@ namespace dynload {
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_CUDA)
-// APIs available after CUDA 11.0
 #if CUDA_VERSION >= 11000
 #define CUSPARSE_ROUTINE_EACH(__macro) \
   __macro(cusparseCreate);             \
@@ -43,10 +42,14 @@ namespace dynload {
   __macro(cusparseCreateCsr);          \
   __macro(cusparseCreateCoo);          \
   __macro(cusparseCreateDnMat);        \
+  __macro(cusparseCreateDnVec);        \
   __macro(cusparseSpMM_bufferSize);    \
   __macro(cusparseSpMM);               \
   __macro(cusparseDestroySpMat);       \
-  __macro(cusparseDestroyDnMat);
+  __macro(cusparseDestroyDnMat);       \
+  __macro(cusparseDestroyDnVec);       \
+  __macro(cusparseSpMV_bufferSize);    \
+  __macro(cusparseSpMV);
 
 CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif

diff --git a/paddle/phi/backends/dynload/cusparse.h b/paddle/phi/backends/dynload/cusparse.h
@@ -41,7 +41,6 @@ extern void *cusparse_dso_handle;
   extern DynLoad__##__name __name
 
 #if defined(PADDLE_WITH_CUDA)
-// APIs available after CUDA 11.0
 #if CUDA_VERSION >= 11000
 #define CUSPARSE_ROUTINE_EACH(__macro) \
   __macro(cusparseCreate);             \
@@ -55,10 +54,14 @@ extern void *cusparse_dso_handle;
   __macro(cusparseCreateCsr);          \
   __macro(cusparseCreateCoo);          \
   __macro(cusparseCreateDnMat);        \
+  __macro(cusparseCreateDnVec);        \
   __macro(cusparseSpMM_bufferSize);    \
   __macro(cusparseSpMM);               \
   __macro(cusparseDestroySpMat);       \
-  __macro(cusparseDestroyDnMat);
+  __macro(cusparseDestroyDnMat);       \
+  __macro(cusparseDestroyDnVec);       \
+  __macro(cusparseSpMV_bufferSize);    \
+  __macro(cusparseSpMV);
 
 CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
 #endif

diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas.h b/paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -37,6 +37,14 @@ class SparseBlas {
             T beta,
             phi::DenseTensor* mat_out) const;
 
+  template <typename T, typename TensorType>
+  void SPMV(bool transa,
+            T alpha,
+            const TensorType& mat_a,
+            const phi::DenseTensor& vec_x,
+            T beta,
+            phi::DenseTensor* vec_out) const;
+
   template <typename T, typename TensorType>
   void SDDMM(bool transa,
              bool transb,
@@ -60,6 +68,11 @@ class SparseBlasT : private SparseBlas<DeviceContext> {
     Base()->template SPMM<T>(args...);
   }
 
+  template <typename... ARGS>  // arguments are taken by value
+  void SPMV(ARGS... args) const {
+    Base()->template SPMV<T>(args...);  // same call on Base(), T made explicit
+  }
+
   template <typename... ARGS>
   void SDDMM(ARGS... args) const {
     Base()->template SDDMM<T>(args...);

diff --git a/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h b/paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
@@ -20,6 +20,7 @@
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/ddim.h"
 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/sparse_coo_tensor.h"
 #include "paddle/phi/core/sparse_csr_tensor.h"
 #include "paddle/phi/core/visit_type.h"
@@ -47,6 +48,8 @@ inline cusparseOperation_t GetTransposeOperation(const bool trans) {
   }
 }
 
+/************* SPARSE MATRIX DESCRIPTOR (COO/CSR) ************/
+
 template <typename T, typename IntT>
 inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
                                 const phi::GPUContext& dev_ctx,
@@ -102,19 +105,83 @@ inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
   }
 }
 
+// Wraps the trailing 2-D slice(s) of COO tensor x in a cusparseSpMatDescr_t.
+template <typename T, typename IntT>
+inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
+                                const phi::GPUContext& dev_ctx,
+                                cusparseSpMatDescr_t* descriptor) {
+  std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
+  auto x_ndims = xdim_vec.size();
+  PADDLE_ENFORCE_GE(
+      x_ndims,
+      2,
+      phi::errors::InvalidArgument("the dim size of SparseCooTensor must be "
+                                   "greater than or equal to 2."));
+
+  int64_t M = xdim_vec[x_ndims - 2];
+  int64_t N = xdim_vec[x_ndims - 1];
+  int batch_size = 1;
+  for (int i = 0; i < x_ndims - 2; i++) {
+    batch_size *= xdim_vec[i];
+  }
+  int64_t nnz = x.nnz();
+  const IntT* indices_data = x.non_zero_indices().data<IntT>();
+  const T* values_data = x.non_zero_elements().data<T>();
+  auto rows_data = indices_data + (x_ndims - 2) * nnz;
+  auto cols_data = indices_data + (x_ndims - 1) * nnz;
+  // Per-batch nnz plus value/index types; index width must match IntT.
+  int64_t batch_nnz = nnz / batch_size;
+  cudaDataType_t gpu_type = GetGpuDataType<T>();
+  auto index_type = sizeof(IntT) == 4 ? CUSPARSE_INDEX_32I : CUSPARSE_INDEX_64I;
+  dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
+    phi::dynload::cusparseCreateCoo(descriptor,
+                                    M,
+                                    N,
+                                    batch_nnz,
+                                    const_cast<IntT*>(rows_data),
+                                    const_cast<IntT*>(cols_data),
+                                    const_cast<T*>(values_data),
+                                    index_type,
+                                    CUSPARSE_INDEX_BASE_ZERO,
+                                    gpu_type);
+  });
+
+  if (batch_size > 1) {
+#if CUDA_VERSION >= 11070
+    dev_ctx.CusparseCall([&](cusparseHandle_t handle) {
+      phi::dynload::cusparseCooSetStridedBatch(
+          *descriptor, batch_size, batch_nnz);
+    });
+#else
+    PADDLE_THROW(phi::errors::Unimplemented(
+        "Batch Sparse matmul use 'cusparseCooSetStridedBatch', which is "
+        "supported from CUDA 11.7"));
+#endif
+  }
+}
+
 template <typename T>
 class CuSparseSpMatDescriptor {
  public:
   explicit CuSparseSpMatDescriptor(const phi::SparseCsrTensor& x,
                                    const phi::GPUContext& dev_ctx)
       : dev_ctx_(dev_ctx) {
     PD_VISIT_INTEGRAL_TYPES(
-        x.non_zero_crows().dtype(), "CuSparseSpMatDescriptor", ([&] {
+        x.non_zero_crows().dtype(), "Csr CuSparseSpMatDescriptor", ([&] {
           CreateCsrDescriptor<T, data_t>(x, dev_ctx_, &descriptor_);
         }));
     VLOG(6) << "Create csr cusparseSpMatDescr_t " << &descriptor_;
   }
 
+  explicit CuSparseSpMatDescriptor(const phi::SparseCooTensor& x,
+                                   const phi::GPUContext& dev_ctx)
+      : dev_ctx_(dev_ctx) {  // COO overload: dispatch on the index dtype
+    PD_VISIT_INTEGRAL_TYPES(  // data_t is presumably supplied by this macro
+        x.non_zero_indices().dtype(), "Coo CuSparseSpMatDescriptor", ([&] {
+          CreateCooDescriptor<T, data_t>(x, dev_ctx_, &descriptor_);
+        }));
+    VLOG(6) << "Create coo cusparseSpMatDescr_t " << &descriptor_;
+  }
+
   ~CuSparseSpMatDescriptor() {
     dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
       phi::dynload::cusparseDestroySpMat(descriptor_);
@@ -129,6 +196,7 @@ class CuSparseSpMatDescriptor {
   cusparseSpMatDescr_t descriptor_;
 };
 
+/************* DENSE MATRIX DESCRIPTOR ************/
 template <typename T>
 class CuSparseDnMatDescriptor {
  public:
@@ -192,6 +260,44 @@ class CuSparseDnMatDescriptor {
   cusparseDnMatDescr_t descriptor_;
 };
 
+/************* DENSE VECTOR DESCRIPTOR ************/
+// Owns a cusparseDnVecDescr_t spanning all x.numel() elements of tensor x.
+template <typename T>
+class CuSparseDnVecDescriptor {
+ public:
+  explicit CuSparseDnVecDescriptor(const phi::DenseTensor& x,
+                                   const phi::GPUContext& dev_ctx)
+      : dev_ctx_(dev_ctx) {
+    std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
+    auto x_ndims = xdim_vec.size();
+    PADDLE_ENFORCE_GE(x_ndims,
+                      1,
+                      phi::errors::InvalidArgument(
+                          "the dim size of Vec must be greater than or "
+                          "equal to 1."));
+    const T* x_data = x.data<T>();  // cusparse API takes a mutable pointer
+    cudaDataType_t gpu_type = GetGpuDataType<T>();
+    dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
+      phi::dynload::cusparseCreateDnVec(
+          &descriptor_, x.numel(), const_cast<T*>(x_data), gpu_type);
+    });
+    VLOG(6) << "Create cusparseDnVecDescr_t " << &descriptor_;
+  }
+
+  ~CuSparseDnVecDescriptor() {
+    dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
+      phi::dynload::cusparseDestroyDnVec(descriptor_);
+    });
+    VLOG(6) << "Destroy cusparseDnVecDescr_t " << &descriptor_;
+  }
+
+  const cusparseDnVecDescr_t& descriptor() const { return descriptor_; }
+
+ private:
+  const phi::GPUContext& dev_ctx_;
+  cusparseDnVecDescr_t descriptor_;
+};
+
 template <>
 template <typename T, typename TensorType>
 void SparseBlas<phi::GPUContext>::SPMM(bool transa,
@@ -239,6 +345,50 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
   });
 }
 
+// vec_out = alpha * op(mat_a) * vec_x + beta * vec_out, via cusparseSpMV.
+template <>
+template <typename T, typename TensorType>
+void SparseBlas<phi::GPUContext>::SPMV(bool transa,
+                                       T alpha,
+                                       const TensorType& mat_a,
+                                       const phi::DenseTensor& vec_x,
+                                       T beta,
+                                       phi::DenseTensor* vec_out) const {
+  CuSparseSpMatDescriptor<T> mat_desc(mat_a, dev_ctx_);
+  CuSparseDnVecDescriptor<T> x_desc(vec_x, dev_ctx_);
+  CuSparseDnVecDescriptor<T> out_desc(*vec_out, dev_ctx_);
+  // Ask cuSPARSE for the scratch size first, then run the product.
+  cudaDataType_t compute_type = GetGpuDataType<T>();
+  size_t workspace_bytes = 0;
+  dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
+    phi::dynload::cusparseSpMV_bufferSize(handle,
+                                          GetTransposeOperation(transa),
+                                          &alpha,
+                                          mat_desc.descriptor(),
+                                          x_desc.descriptor(),
+                                          &beta,
+                                          out_desc.descriptor(),
+                                          compute_type,
+                                          CUSPARSE_MV_ALG_DEFAULT,
+                                          &workspace_bytes);
+  });
+
+  paddle::memory::allocation::AllocationPtr workspace =
+      paddle::memory::Alloc(dev_ctx_, workspace_bytes);
+  dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
+    phi::dynload::cusparseSpMV(handle,
+                               GetTransposeOperation(transa),
+                               &alpha,
+                               mat_desc.descriptor(),
+                               x_desc.descriptor(),
+                               &beta,
+                               out_desc.descriptor(),
+                               compute_type,
+                               CUSPARSE_MV_ALG_DEFAULT,
+                               workspace->ptr());
+  });
+}
+
 #if CUDA_VERSION >= 11030
 template <>
 template <typename T, typename TensorType>
@@ -249,12 +399,11 @@ void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
                                         const phi::DenseTensor& mat_b,
                                         T beta,
                                         TensorType* mat_out) const {
-  cudaDataType_t gpu_type = GetGpuDataType<T>();
-
   auto a_descriptor = CuSparseDnMatDescriptor<T>(mat_a, dev_ctx_);
   auto b_descriptor = CuSparseDnMatDescriptor<T>(mat_b, dev_ctx_);
   auto out_descriptor = CuSparseSpMatDescriptor<T>(*mat_out, dev_ctx_);
 
+  cudaDataType_t gpu_type = GetGpuDataType<T>();
   size_t buffer_size = 0;
   dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
     phi::dynload::cusparseSDDMM_bufferSize(handle,

diff --git a/paddle/phi/kernels/sparse/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/mv_grad_kernel.cc
@@ -0,0 +1,56 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/sparse/mv_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+namespace sparse {
+
+template <typename T, typename Context>  // CPU grad of sparse.mv, COO input
+void MvCooGradKernel(const Context& dev_ctx,
+                     const SparseCooTensor& x,
+                     const DenseTensor& vec,
+                     const DenseTensor& dout,
+                     SparseCooTensor* dx,
+                     DenseTensor* dvec) {  // stub: no argument is read
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Not support CPU backward kernel of 'sparse.mv' now."));
+}
+
+template <typename T, typename Context>  // CPU grad of sparse.mv, CSR input
+void MvCsrGradKernel(const Context& dev_ctx,
+                     const SparseCsrTensor& x,
+                     const DenseTensor& vec,
+                     const DenseTensor& dout,
+                     SparseCsrTensor* dx,
+                     DenseTensor* dvec) {  // stub: no argument is read
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "Not support CPU backward kernel of 'sparse.mv' now."));
+}
+
+}  // namespace sparse
+}  // namespace phi
+
+PD_REGISTER_KERNEL(  // CPU registration for float/double; input 0 is COO
+    mv_coo_grad, CPU, ALL_LAYOUT, phi::sparse::MvCooGradKernel, float, double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
+}
+
+PD_REGISTER_KERNEL(  // CPU registration for float/double; input 0 is CSR
+    mv_csr_grad, CPU, ALL_LAYOUT, phi::sparse::MvCsrGradKernel, float, double) {
+  kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
+}