[Sparse] Add sparse matmul kernel (coo*dense->dense) #44346

Merged · 1 commit · Jul 18, 2022
4 changes: 4 additions & 0 deletions paddle/fluid/platform/dynload/cusparse.cc
@@ -28,6 +28,10 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R3
CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif

} // namespace dynload
} // namespace platform
} // namespace paddle
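
(The #ifdef guard above lets the same source build against older CUDA toolkits: the CUSPARSE_ROUTINE_EACH_R3 group is only defined when the toolkit provides the routines it lists. DEFINE_WRAP comes from Paddle's dynload layer, which resolves cuSPARSE symbols at run time instead of linking them directly. Below is a minimal sketch of that pattern, not Paddle's exact macros; the real ones in paddle/phi/backends/dynload/ add thread-safe one-time loading, and the R3 group presumably lists the cusparseSDDMM entry points this PR's backward pass starts using.)

#include <cusparse.h>  // declares the ::cusparse* prototypes used below
#include <dlfcn.h>

// Each wrapped routine becomes a callable object that loads libcusparse and
// resolves the symbol on first use, then forwards all arguments unchanged.
#define SKETCH_DEFINE_WRAP(__name)                                  \
  struct DynLoad__##__name {                                        \
    template <typename... Args>                                     \
    auto operator()(Args... args) {                                 \
      using Func = decltype(&::__name);                             \
      static void* handle = dlopen("libcusparse.so", RTLD_LAZY);    \
      static void* sym = dlsym(handle, #__name);                    \
      return reinterpret_cast<Func>(sym)(args...);                  \
    }                                                               \
  } __name;

// A routine group is then just a macro list of symbol names, gated on the
// CUDA version that introduced them (hypothetical reconstruction):
// #if CUDA_VERSION >= 11030
// #define CUSPARSE_ROUTINE_EACH_R3(__macro) \
//   __macro(cusparseSDDMM_bufferSize);      \
//   __macro(cusparseSDDMM_preprocess);      \
//   __macro(cusparseSDDMM);
// #endif
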
10 changes: 5 additions & 5 deletions paddle/phi/api/yaml/sparse_api.yaml
@@ -297,18 +297,18 @@
args : (Tensor x, Tensor y, Tensor mask)
output : Tensor(out)
kernel :
func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr}
func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr}
layout : x
backward: masked_matmul_grad

- api: matmul
args : (Tensor x, Tensor y)
output : Tensor(out)
kernel :
func : csr_dense_matmul{sparse_csr, dense -> dense},
csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr},
coo_dense_matmul{sparse_coo, dense -> dense},
coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo}
func : matmul_csr_dense {sparse_csr, dense -> dense},
matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr},
matmul_coo_dense {sparse_coo, dense -> dense},
matmul_coo_coo {sparse_coo, sparse_coo -> sparse_coo}
layout : x
backward: matmul_grad
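
(Each "func" entry uses the convention kernel_name{input layouts -> output layout}; at run time the framework selects the variant whose layout signature matches the arguments. A hypothetical C++ sketch of that dispatch follows; it is purely illustrative, since the real code is generated from this YAML, and CallSparseKernel is a made-up stand-in.)

// Illustrative only: layout-based dispatch corresponding to the YAML above.
Tensor matmul(const Tensor& x, const Tensor& y) {
  if (x.is_sparse_csr_tensor() && y.is_dense_tensor()) {
    return CallSparseKernel("matmul_csr_dense", x, y);   // -> dense
  }
  if (x.is_sparse_csr_tensor() && y.is_sparse_csr_tensor()) {
    return CallSparseKernel("matmul_csr_csr", x, y);     // -> sparse_csr
  }
  if (x.is_sparse_coo_tensor() && y.is_dense_tensor()) {
    return CallSparseKernel("matmul_coo_dense", x, y);   // -> dense
  }
  return CallSparseKernel("matmul_coo_coo", x, y);       // -> sparse_coo
}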

7 changes: 5 additions & 2 deletions paddle/phi/api/yaml/sparse_bw_api.yaml
@@ -125,14 +125,17 @@
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense}
func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense}

- backward_api : matmul_grad
forward : matmul(Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense}
func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense},
matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr},
matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense},
matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}
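
(These registrations implement the standard matmul calculus for out = x @ y:

$$\frac{\partial \ell}{\partial x} = \frac{\partial \ell}{\partial \mathrm{out}}\, y^{\top}, \qquad \frac{\partial \ell}{\partial y} = x^{\top}\, \frac{\partial \ell}{\partial \mathrm{out}}$$

with the caveat that when x is sparse, dx is only computed and stored on x's sparsity pattern; on GPU that restricted product is exactly a sampled dense-dense matmul, SDDMM.)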

- backward_api : multiply_grad
forward : multiply(Tensor x, Tensor y) -> Tensor(out)
4 changes: 4 additions & 0 deletions paddle/phi/backends/dynload/cusparse.cc
@@ -30,5 +30,9 @@ CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R3
CUSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif

} // namespace dynload
} // namespace phi
3 changes: 3 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h
@@ -298,6 +298,7 @@ class CuSparseDnVecDescriptor {
cusparseDnVecDescr_t descriptor_;
};

/************* SPARSE*DENSE->DENSE MATMUL ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMM(bool transa,
@@ -345,6 +346,7 @@ void SparseBlas<phi::GPUContext>::SPMM(bool transa,
});
}

/************* SPARSE*DENSE->DENSE MV ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMV(bool transa,
@@ -389,6 +391,7 @@ void SparseBlas<phi::GPUContext>::SPMV(bool transa,
});
}

/************* DENSE*DENSE->SPARSE MATMUL ************/
#if CUDA_VERSION >= 11030
template <>
template <typename T, typename TensorType>
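
(The three banners added in this file mark the cuSPARSE generic-API operations the SparseBlas wrapper exposes. Following cuSPARSE's documented semantics:

$$
\begin{aligned}
\text{SpMM:}\quad & C = \alpha\,\mathrm{op}(A)\,\mathrm{op}(B) + \beta\,C && (A \text{ sparse};\ B, C \text{ dense})\\
\text{SpMV:}\quad & y = \alpha\,\mathrm{op}(A)\,x + \beta\,y && (A \text{ sparse};\ x, y \text{ dense})\\
\text{SDDMM:}\quad & C = \alpha\,\bigl(\mathrm{op}(A)\,\mathrm{op}(B)\bigr)\circ \mathrm{spy}(C) + \beta\,C && (A, B \text{ dense};\ C \text{ sparse})
\end{aligned}
$$

where op(·) is identity or transpose, ∘ is the entrywise product, and spy(C) keeps only the positions present in C's sparsity pattern.)
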
12 changes: 6 additions & 6 deletions paddle/phi/kernels/sparse/cpu/matmul_grad_kernel.cc
@@ -22,7 +22,7 @@ namespace sparse {

// TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
void MatmulCsrDenseGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
@@ -34,7 +34,7 @@ void CsrDenseMatmulGradKernel(const Context& dev_ctx,

// TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
@@ -47,18 +47,18 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul_grad,
PD_REGISTER_KERNEL(matmul_csr_dense_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
phi::sparse::MatmulCsrDenseGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul_grad,
PD_REGISTER_KERNEL(masked_matmul_csr_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
phi::sparse::MaskedMatmulCsrGradKernel,
float,
double) {}
12 changes: 6 additions & 6 deletions paddle/phi/kernels/sparse/cpu/matmul_kernel.cc
@@ -22,7 +22,7 @@ namespace sparse {

// TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
void MatmulCsrDenseKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
@@ -32,7 +32,7 @@ void CsrDenseMatmulKernel(const Context& dev_ctx,

// TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
void MaskedMatmulCsrKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
@@ -44,18 +44,18 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul,
PD_REGISTER_KERNEL(matmul_csr_dense,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
phi::sparse::MatmulCsrDenseKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul,
PD_REGISTER_KERNEL(masked_matmul_csr,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
phi::sparse::MaskedMatmulCsrKernel,
float,
double) {}
24 changes: 7 additions & 17 deletions paddle/phi/kernels/sparse/empty_kernel.cc
@@ -26,37 +26,27 @@ template <typename T, typename Context>
void EmptyLikeCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
SparseCooTensor* out) {
const DenseTensor& x_indices = x.non_zero_indices();
out->set_dims(x.dims());
*(out->mutable_non_zero_indices()) = x.non_zero_indices();

const DenseTensor& x_values = x.non_zero_elements();
DenseTensor* out_indices = out->mutable_non_zero_indices();
DenseTensor* out_values = out->mutable_non_zero_elements();

phi::Copy(dev_ctx, x_indices, dev_ctx.GetPlace(), false, out_indices);

out_values->Resize(x_values.dims());
dev_ctx.template Alloc<T>(out_values);

out->set_dims(x.dims());
}

template <typename T, typename Context>
void EmptyLikeCsrKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
SparseCsrTensor* out) {
const DenseTensor& x_crows = x.non_zero_crows();
const DenseTensor& x_cols = x.non_zero_cols();
out->set_dims(x.dims());
*(out->mutable_non_zero_crows()) = x.non_zero_crows();
*(out->mutable_non_zero_cols()) = x.non_zero_cols();

const DenseTensor& x_values = x.non_zero_elements();
DenseTensor* out_crows = out->mutable_non_zero_crows();
DenseTensor* out_cols = out->mutable_non_zero_cols();
DenseTensor* out_values = out->mutable_non_zero_elements();

phi::Copy(dev_ctx, x_crows, dev_ctx.GetPlace(), false, out_crows);
phi::Copy(dev_ctx, x_cols, dev_ctx.GetPlace(), false, out_cols);

out_values->Resize(x_values.dims());
dev_ctx.template Alloc<T>(out_values);

out->set_dims(x.dims());
}

} // namespace sparse
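
(The rewrite above drops the explicit phi::Copy of the index tensors. It relies on phi::DenseTensor's copy-assignment sharing the reference-counted allocation, so assigning indices/crows/cols is a cheap shallow copy and only the values buffer is freshly allocated. A minimal sketch of that assumption:)

#include "paddle/phi/core/dense_tensor.h"

// Illustrative: DenseTensor assignment copies metadata and shares the
// underlying Allocation holder; no device-to-device memcpy is issued.
void ShareIndices(const phi::DenseTensor& src, phi::DenseTensor* dst) {
  *dst = src;  // dst now aliases src's memory
}
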
60 changes: 54 additions & 6 deletions paddle/phi/kernels/sparse/gpu/matmul_grad_kernel.cu
@@ -22,13 +22,52 @@ limitations under the License. */
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"

namespace phi {
namespace sparse {

template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
void MatmulCooDenseGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCooTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11030
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);

// dx{SparseCoo} = dout{Dense} * y'{Dense}
if (dx) {
// 'cusparseSDDMM' only supports CSR for now, so take the COO->CSR->COO
// round trip here, which adds some conversion overhead.
EmptyLikeCooKernel<T, Context>(dev_ctx, x, dx);
SparseCsrTensor dx_csr = SparseCooToCsr<T, Context>(dev_ctx, *dx);
sparse_blas.SDDMM(
false, true, static_cast<T>(1), dout, y, static_cast<T>(0), &dx_csr);
SparseCsrToCooKernel<T, Context>(dev_ctx, dx_csr, dx);
}

// dy{Dense} = x'{SparseCoo} * dout{Dense}
if (dy) {
MetaTensor meta_dy(dy);
meta_dy.set_dims(y.dims());
meta_dy.set_dtype(y.dtype());
dev_ctx.template Alloc<T>(dy);

sparse_blas.SPMM(
true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
}
#else
PADDLE_THROW(phi::errors::Unimplemented(
"backward of 'sparse.matmul' uses cusparseSDDMM, which is supported from "
"CUDA 11.3"));
#endif
}

template <typename T, typename Context>
void MatmulCsrDenseGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
@@ -66,7 +105,7 @@ }
}

template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
void MaskedMatmulCsrGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
@@ -119,18 +158,27 @@ void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul_grad,
PD_REGISTER_KERNEL(matmul_coo_dense_grad,
GPU,
ALL_LAYOUT,
phi::sparse::MatmulCooDenseGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(matmul_csr_dense_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
phi::sparse::MatmulCsrDenseGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul_grad,
PD_REGISTER_KERNEL(masked_matmul_csr_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
phi::sparse::MaskedMatmulCsrGradKernel,
float,
double) {}
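
(Tying the flags in MatmulCooDenseGradKernel to the math: the SDDMM call with (transa, transb) = (false, true) computes dx restricted to x's pattern, and the SPMM call with (true, false) computes dy:

$$dx = \bigl(\mathrm{dout}\; y^{\top}\bigr)\circ \mathrm{spy}(x), \qquad dy = x^{\top}\,\mathrm{dout}$$

The COO input is round-tripped through CSR only because cusparseSDDMM currently accepts CSR descriptors.)
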
45 changes: 35 additions & 10 deletions paddle/phi/kernels/sparse/gpu/matmul_kernel.cu
@@ -31,11 +31,11 @@ limitations under the License. */
namespace phi {
namespace sparse {

template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
template <typename T, typename Context, typename TensorType>
void MatmulKernelImpl(const Context& dev_ctx,
const TensorType& x,
const DenseTensor& y,
DenseTensor* out) {
#if CUDA_VERSION >= 11000
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
@@ -91,7 +91,23 @@ }
}

template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
void MatmulCooDenseKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
DenseTensor* out) {
MatmulKernelImpl<T>(dev_ctx, x, y, out);
}

template <typename T, typename Context>
void MatmulCsrDenseKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
MatmulKernelImpl<T>(dev_ctx, x, y, out);
}
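
(A single templated MatmulKernelImpl can serve both layouts because cuSPARSE's generic API hides the storage format behind cusparseSpMatDescr_t; the SparseBlas helper builds the right descriptor per format. A sketch with one illustrative choice of index/value types, all pointers assumed to be device buffers:)

#include <cusparse.h>

// Build a sparse-matrix descriptor for either layout; both feed cusparseSpMM.
cusparseSpMatDescr_t MakeDescr(bool is_csr, int64_t rows, int64_t cols,
                               int64_t nnz, void* rows_or_crows,
                               void* col_indices, void* values) {
  cusparseSpMatDescr_t mat;
  if (is_csr) {
    cusparseCreateCsr(&mat, rows, cols, nnz, rows_or_crows, col_indices,
                      values, CUSPARSE_INDEX_64I, CUSPARSE_INDEX_64I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  } else {
    cusparseCreateCoo(&mat, rows, cols, nnz, rows_or_crows, col_indices,
                      values, CUSPARSE_INDEX_64I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
  }
  return mat;
}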

template <typename T, typename Context>
void MaskedMatmulCsrKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
@@ -176,18 +192,27 @@ void CsrMaskedMatmulKernel(const Context& dev_ctx,
} // namespace sparse
} // namespace phi

PD_REGISTER_KERNEL(csr_dense_matmul,
PD_REGISTER_KERNEL(matmul_csr_dense,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
phi::sparse::MatmulCsrDenseKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}

PD_REGISTER_KERNEL(csr_masked_matmul,
PD_REGISTER_KERNEL(matmul_coo_dense,
GPU,
ALL_LAYOUT,
phi::sparse::MatmulCooDenseKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}

PD_REGISTER_KERNEL(masked_matmul_csr,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
phi::sparse::MaskedMatmulCsrKernel,
float,
double) {}
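
(For reference, the element-wise semantics of masked_matmul_csr as a naive loop; this is an illustrative sketch only, and roughly what the still-TODO CPU kernel would need to compute. x is M×K row-major dense, y is K×N row-major dense, and out shares the mask's CSR pattern:)

#include <cstdint>

template <typename T>
void NaiveMaskedMatmul(const T* x, const T* y, const int64_t* crows,
                       const int64_t* cols, int64_t M, int64_t K, int64_t N,
                       T* out_values) {
  for (int64_t i = 0; i < M; ++i) {
    // Only positions present in the CSR mask are computed and stored.
    for (int64_t p = crows[i]; p < crows[i + 1]; ++p) {
      const int64_t j = cols[p];
      T acc = static_cast<T>(0);
      for (int64_t k = 0; k < K; ++k) {
        acc += x[i * K + k] * y[k * N + j];
      }
      out_values[p] = acc;
    }
  }
}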