add some kernels(csr*dense->csr, dense*dense->csr) of SparseTensor matmul (PaddlePaddle#42935)

* add some kernels (csr*dense->csr, dense*dense->csr) of SparseTensor matmul

* fix CI

* fix CI

* fix comment

* fix comment
zhwesky2010 authored and sneaxiy committed Jun 27, 2022
1 parent 3533b1b commit 07d1017
Showing 27 changed files with 1,656 additions and 65 deletions.
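In terms of the cuSPARSE-style primitives declared later in this commit (SparseBlas::DSDMM and SparseBlas::SDDMM in paddle/phi/kernels/funcs/sparse/sparse_blas.h), the two new matmul kernels amount to the following products. These formulas assume the usual SpMM/SDDMM semantics and are a reading of the declared signatures, not a specification taken from the commit itself:

    C_dense = alpha * op(A_csr) * op(B_dense) + beta * C_dense                      (DSDMM: sparse @ dense -> dense)
    C_csr   = alpha * (op(A_dense) * op(B_dense)) masked by spy(C_csr) + beta * C_csr   (SDDMM: dense @ dense -> sparse)

Here op(.) is the optional transpose selected by transa/transb, and spy(C_csr) restricts the dense product to the existing sparsity pattern of the CSR output.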
5 changes: 5 additions & 0 deletions paddle/fluid/eager/grad_node_info.cc
@@ -27,6 +27,7 @@
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

/**
* Implementation of GradNodeBase, Edge and GradTensorHolder.
@@ -114,6 +115,10 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
phi::SparseCooTensor* coo_tensor =
static_cast<phi::SparseCooTensor*>(fwd_out.impl().get());
dense_tensor = coo_tensor->mutable_non_zero_elements();
} else if (phi::SparseCsrTensor::classof(fwd_out.impl().get())) {
phi::SparseCsrTensor* csr_tensor =
static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
dense_tensor = csr_tensor->mutable_non_zero_elements();
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
13 changes: 11 additions & 2 deletions paddle/fluid/eager/grad_tensor_holder.cc
@@ -66,8 +66,17 @@ void GradTensorHolder::CopyValueFromTensor(
// Create new tensor->impl and fill it with 1.0
if (t.defined()) {
// Fill with 1.0; use full to support complex dtypes, which ones_like does not support.
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
if (t.is_dense_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
} else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::sparse::full_like(t, 1, t.dtype());
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
"now."));
}
egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
->SetStopGradient(false);
}
4 changes: 2 additions & 2 deletions paddle/fluid/platform/device/gpu/cuda/cusparse_helper.h
@@ -31,7 +31,7 @@ class CusparseHandleHolder {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(handle_, stream));
#endif
@@ -41,7 +41,7 @@

~CusparseHandleHolder() PADDLE_MAY_THROW {
#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle_));
#endif
#endif
4 changes: 0 additions & 4 deletions paddle/fluid/platform/dynload/cusparse.cc
@@ -24,10 +24,6 @@ namespace dynload {
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
39 changes: 17 additions & 22 deletions paddle/fluid/platform/dynload/cusparse.h
@@ -29,23 +29,17 @@ namespace dynload {
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
@@ -59,11 +53,13 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
@@ -72,8 +68,7 @@ CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
4 changes: 0 additions & 4 deletions paddle/phi/backends/dynload/cusparse.cc
@@ -26,10 +26,6 @@ void *cusparse_dso_handle;
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif

#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
43 changes: 19 additions & 24 deletions paddle/phi/backends/dynload/cusparse.h
@@ -30,34 +30,28 @@ extern void *cusparse_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
using Func = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
return reinterpret_cast<Func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name

#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);

CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);

// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
__macro(cusparseCreateMatDescr); \
__macro(cusparseDestroy); \
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
@@ -71,11 +65,13 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);

CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif

// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
@@ -84,8 +80,7 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)

CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif

#endif

#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
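For readers unfamiliar with the dynload machinery, here is a hand-written sketch of what DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP expands to for a single routine. It is illustrative only, relies on the cusparse_dso_flag, cusparse_dso_handle and GetCusparseDsoHandle declarations already present in this header, and is not part of the diff:

// Approximate expansion of DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP(cusparseCreate).
struct DynLoad__cusparseCreate {
  template <typename... Args>
  cusparseStatus_t operator()(Args... args) {
    using Func = decltype(&::cusparseCreate);
    // Open libcusparse exactly once per process, then cache the resolved symbol.
    std::call_once(cusparse_dso_flag, []() {
      cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle();
    });
    static void *p_cusparseCreate = dlsym(cusparse_dso_handle, "cusparseCreate");
    return reinterpret_cast<Func>(p_cusparseCreate)(args...);
  }
};
extern DynLoad__cusparseCreate cusparseCreate;

Callers simply write phi::dynload::cusparseCreate(&handle); the library is loaded lazily on the first call to any wrapped routine.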
13 changes: 11 additions & 2 deletions paddle/phi/backends/gpu/gpu_context.cc
@@ -402,7 +402,10 @@ struct GPUContext::Impl {

void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }

sparseHandle_t GetSparseHandle() const {
sparseHandle_t GetSparseHandle() {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) phi::InitSparseHandle(&sparse_handle_, stream_);
});
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
@@ -519,7 +522,12 @@ struct GPUContext::Impl {
}

inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
const std::function<void(sparseHandle_t)>& callback) {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) {
phi::InitSparseHandle(&sparse_handle_, stream_);
}
});
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
@@ -598,6 +606,7 @@ struct GPUContext::Impl {
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};

std::once_flag flag_sparse_;
std::once_flag flag_blas_;
std::once_flag flag_blaslt_;
std::once_flag flag_dnn_;
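A minimal, self-contained sketch of the lazy-initialization pattern adopted above for the cuSPARSE handle; the type and helper names below (Handle, CreateHandle) are placeholders rather than Paddle APIs:

#include <mutex>

struct Handle {};                               // stands in for sparseHandle_t
Handle* CreateHandle() { return new Handle; }   // placeholder for phi::InitSparseHandle

class LazyHandle {
 public:
  // The handle is created on first use only, so contexts that never touch
  // cuSPARSE pay no setup cost; std::call_once keeps creation thread-safe.
  Handle* Get() {
    std::call_once(flag_, [this]() {
      if (handle_ == nullptr) handle_ = CreateHandle();
    });
    return handle_;
  }

 private:
  Handle* handle_{nullptr};
  std::once_flag flag_;
};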
4 changes: 2 additions & 2 deletions paddle/phi/backends/gpu/gpu_resources.cc
@@ -250,7 +250,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream));
#endif
@@ -259,7 +259,7 @@

void DestroySparseHandle(sparseHandle_t handle) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
if (handle != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle));
handle = nullptr;
4 changes: 4 additions & 0 deletions paddle/phi/core/sparse_csr_tensor.h
@@ -85,6 +85,10 @@ class SparseCsrTensor : public TensorBase,
/// \return The non-zero elements in the original dense tensor.
const DenseTensor& non_zero_elements() const { return non_zero_elements_; }

/// \brief Returns the total number of non zero elements in original dense
/// tensor.
int64_t nnz() const { return non_zero_elements_.numel(); }

/// \brief Return the number of elements contained in original dense tensor
/// \return The number of elements contained in original dense tensor
int64_t numel() const override { return product(dims_); }
96 changes: 96 additions & 0 deletions paddle/phi/kernels/funcs/sparse/sparse_blas.h
@@ -0,0 +1,96 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"

namespace phi {
namespace funcs {
namespace sparse {

template <typename DeviceContext>
class SparseBlas {
public:
explicit SparseBlas(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}

// TODO(zhouwei25): implement "COO @ DENSE -> DENSE" of DSDMM
template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCooTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCsrTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;

template <typename T>
void SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::SparseCsrTensor* mat_c) const;

private:
const DeviceContext& dev_ctx_;
};

template <typename DeviceContext, typename T>
class SparseBlasT : private SparseBlas<DeviceContext> {
public:
using SparseBlas<DeviceContext>::SparseBlas;

template <typename... ARGS>
void DSDMM(ARGS... args) const {
Base()->template DSDMM<T>(args...);
}

template <typename... ARGS>
void SDDMM(ARGS... args) const {
Base()->template SDDMM<T>(args...);
}

private:
const SparseBlas<DeviceContext>* Base() const {
return static_cast<const SparseBlas<DeviceContext>*>(this);
}
};

template <typename DeviceContext, typename T>
inline SparseBlasT<DeviceContext, T> GetSparseBlas(
const DeviceContext& dev_ctx) {
return SparseBlasT<DeviceContext, T>(dev_ctx);
}

} // namespace sparse
} // namespace funcs
} // namespace phi

#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h"
#endif
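A hedged usage sketch of the helper declared above, as a GPU kernel might invoke it. The context and tensor variables (dev_ctx, csr_a, dense_a, dense_b, dense_c, csr_c) are placeholders, and the call pattern is inferred from the declared signatures rather than taken from this commit:

// C_dense = 1 * A_csr * B_dense + 0 * C_dense  (sparse @ dense -> dense)
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<phi::GPUContext, float>(dev_ctx);
sparse_blas.DSDMM(/*transa=*/false, /*transb=*/false,
                  1.0f, csr_a, dense_b,
                  0.0f, &dense_c);

// C_csr = 1 * (A_dense * B_dense) sampled on C_csr's sparsity pattern + 0 * C_csr
sparse_blas.SDDMM(/*transa=*/false, /*transb=*/false,
                  1.0f, dense_a, dense_b,
                  0.0f, &csr_c);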
