From d04f2e5ab06f83af5b6a26fccfde742dcc0903cf Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 7 Apr 2022 09:13:51 +0000 Subject: [PATCH 01/23] batch_norm --- .../fluid/tests/unittests/test_sparse_norm.py | 32 +++++++++ python/paddle/sparse/__init__.py | 3 +- python/paddle/sparse/layer/__init__.py | 1 + python/paddle/sparse/layer/norm.py | 70 +++++++++++++++++++ 4 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_norm.py create mode 100644 python/paddle/sparse/layer/norm.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm.py b/python/paddle/fluid/tests/unittests/test_sparse_norm.py new file mode 100644 index 0000000000000..ac505e3b7b42e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseBatchNorm(unittest.TestCase): + def test_sparse_batch_norm(self): + with _test_eager_guard(): + shape = [2, 6, 6, 6, 4] + dense_x = paddle.randn(shape) + print(dense_x) + batch_norm = paddle.nn.BatchNorm3D(4, data_format="NDHWC") + dense_y = batch_norm(dense_x) + sparse_dim = 4 + sparse_x = dense_x.to_sparse_coo(sparse_dim) + batch_norm = paddle.sparse.BatchNorm(4) diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index aff9625469ef2..1bcd87efe6956 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -15,5 +15,6 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor from .layer.activation import ReLU +from .layer.norm import BatchNorm -__all__ = ['sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU'] +__all__ = ['sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'BatchNorm'] diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index 66abce260b6f7..5a5f59183f12e 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. from .activation import ReLU +from .norm import BatchNorm __all__ = [] diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py new file mode 100644 index 0000000000000..4f2818cd2dcb2 --- /dev/null +++ b/python/paddle/sparse/layer/norm.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class BatchNorm(paddle.fluid.dygraph.BatchNorm): + def __init__(self, + num_channels, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + dtype='float32', + data_layout='NCHW', + in_place=False, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, + trainable_statistics=False): + super(BatchNorm, self).__init__( + num_channels, + act=act, + is_test=is_test, + momentum=momentum, + epsilon=epsilon, + param_attr=param_attr, + bias_attr=bias_attr, + dtype=dtype, + data_layout, + in_place=in_place, + moving_mean_name=moving_mean_name, + moving_variance_name=moving_variance_name, + do_model_average_for_mean_and_var=do_model_average_for_mean_and_var, + use_global_stats=use_global_stats, + trainable_statistics=tranable_statistics) + + def forward(self, input): + values = input.values() + out = super(BatchNorm, self).forward(values) + return paddle.sparse.sparse_coo_tensor( + input.indices(), + out, + shape=input.shape, + stop_gradient=input.stop_gradient) From bfe56c73aebdcdd86c768225e26b458fc73a65c6 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Sat, 9 Apr 2022 14:21:50 +0000 Subject: [PATCH 02/23] add kernel sparse_mask_helper; sparse_coo_tensor_grad --- .../phi/kernels/funcs/sparse/common_shape.h | 27 +++ .../kernels/sparse/cpu/sparse_mask_kernel.cc | 103 ++++++++++-- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 12 ++ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 156 +++++++++++++++++- .../kernels/sparse/gpu/sparse_utils_kernel.cu | 12 ++ .../phi/kernels/sparse/sparse_mask_kernel.h | 6 + .../sparse/sparse_utils_grad_kernel.cc | 25 +++ .../kernels/sparse/sparse_utils_grad_kernel.h | 9 + .../phi/kernels/sparse/sparse_utils_kernel.h | 12 ++ .../tests/unittests/test_sparse_utils_op.py | 21 +++ python/paddle/sparse/creation.py | 4 +- python/paddle/utils/code_gen/sparse_api.yaml | 8 + .../paddle/utils/code_gen/sparse_bw_api.yaml | 7 + 13 files changed, 389 insertions(+), 13 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index 3617e3cd2f406..ea7583a16f9e2 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -40,6 +40,33 @@ inline const DDim InferDenseDims(const DDim& x_dims, return values_dims; } +template +inline IntT HOSTDEVICE IndicesToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + 
for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +// 1. indices.dims().size() == 2 +template +inline const void CalcOffsetsPerDim(const DenseTensor& indices, + const DDim& dims, + std::vector* offsets) { + const DDim& indices_dims = indices.dims(); + const IntT sparse_dim = indices_dims[0]; + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + (*offsets)[i] = offset; + offset *= dims[i]; + } +} + } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 0a5e145312e0e..8d9f0c37df18c 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/api/ext/dispatch.h" @@ -38,12 +39,6 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - std::vector sparse_offsets(sparse_dim); - int64_t offset = 1; - for (int i = sparse_dim - 1; i >= 0; i--) { - sparse_offsets[i] = offset; - offset *= dims[i]; - } DenseTensor out_indices = phi::EmptyLike(dev_ctx, indices); DenseTensor out_values = phi::EmptyLike(dev_ctx, values); @@ -51,19 +46,23 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, // the out_indices is same as indices of mask phi::Copy(dev_ctx, indices, dev_ctx.GetPlace(), false, &out_indices); - const IntT* indices_ptr = indices.data(); T* out_values_ptr = out_values.data(); const T* x_ptr = x.data(); const int64_t non_zero_num = mask.nnz(); auto dims_2d = flatten_to_2d(dims, sparse_dim); const int cols = dims_2d[1]; + const IntT* indices_ptr = indices.data(); + std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + // phi::funcs::sparse::FlattenIndices(indices, x.dims(), &out_indexs); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = 0; - for (int j = 0; j < sparse_dim; j++) { - index += indices_ptr[j * non_zero_num + i] * sparse_offsets[j]; - } + out_indexs[i] = phi::funcs::sparse::IndicesToIndex( + indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); + } + + for (int64_t i = 0; i < non_zero_num; i++) { + int64_t index = out_indexs[i]; memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } out->SetMember(out_indices, out_values, dims, true); @@ -85,6 +84,75 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +template +void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), + mask_indexs(mask_indices.dims()[1]); + phi::funcs::sparse::CalcOffsetsPerDim( + x.non_zero_indices(), x.dims(), &sparse_offsets); + + auto FlattenIndices = [](const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + std::vector* out) { + for (int64_t i = 0; i < non_zero_num; 
i++) { + (*out)[i] = phi::funcs::sparse::IndicesToIndex( + indices, sparse_offsets, non_zero_num, sparse_dim, i); + } + }; + + FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + &x_indexs); + FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + &mask_indexs); + + std::set x_indexs_set(x_indexs.begin(), x_indexs.end()); + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + T* out_ptr = out->data(); + memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); + const int stride = x.non_zero_elements().dims()[1]; + const T* in_ptr = x.non_zero_elements().data(); + for (uint64_t i = 0; i < mask_indexs.size(); i++) { + auto iter = x_indexs_set.find(mask_indexs[i]); + if (iter != x_indexs_set.end()) { + memcpy(out_ptr + i * stride, + in_ptr + mask_indexs[i] * stride, + stride * sizeof(T)); + } + } +} + +/** + * @brief filter values from x.values() using mask_indices + */ +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperCPUKernel", ([&] { + SparseMaskHelperCPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -101,3 +169,16 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + CPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index acc834269663d..0499371a4dd17 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -394,3 +394,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index d206d6bbc195c..1f6b32d90b14e 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include + #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/ddim.h" @@ -20,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/common_shape.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" @@ -59,7 +62,7 @@ void SparseMaskGPUKernel(const GPUContext& dev_ctx, const DenseTensor& indices = mask.non_zero_indices(); const DenseTensor& values = mask.non_zero_elements(); int sparse_dim = indices.dims().size(); - DenseTensor sparse_offsets = phi::Empty( + DenseTensor sparse_offsets = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT64, {sparse_dim}, DataLayout::NCHW)); std::vector h_sparse_offsets(sparse_dim); @@ -121,6 +124,143 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { + out[i] = phi::funcs::sparse::IndicesToIndex( + indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +template +__global__ void SparseMaskCopyKernel(const IntT* x_indexs, + const IntT* mask_indexs, + const IntT* bound_out, + const T* x_values, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n, int64_t) { + const IntT j = bound_out[i]; + if (j >= 0 && j < n && mask_indexs[i] == x_indexs[j]) { + for (int k = 0; k < stride; k++) { + out_values[i * stride + k] = x_values[j * stride + k]; + } + } + } +} + +template +void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PADDLE_ENFORCE_EQ( + mask_indices.dims().size(), + 2, + phi::errors::InvalidArgument("the mask_indices must be 2-D tensor")); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); + + std::vector sparse_offsets( + sparse_dim); //, x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); + + DenseTensorMeta x_indices_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indices_meta( + indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); + DenseTensorMeta sparse_offset_meta( + indices_dtype, {sparse_dim}, DataLayout::NCHW); + + DenseTensor x_indexs = + phi::Empty(dev_ctx, std::move(x_indices_meta)); + DenseTensor mask_indexs = + phi::Empty(dev_ctx, std::move(mask_indices_meta)); + DenseTensor bound_out = + phi::Empty(dev_ctx, std::move(mask_indices_meta)); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + IntT* x_indexs_ptr = x_indexs.data(); + IntT* mask_indexs_ptr = mask_indexs.data(); + IntT* bound_out_ptr = bound_out.data(); + + // 1. calc the offsets of per dim + phi::funcs::sparse::CalcOffsetsPerDim( + x.non_zero_indices(), x.dims(), &sparse_offsets); + // 2. copy sparse_offsets to device + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 3. 
flatten x indices and mask indices + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); + FlattenIndicesKernel<<>>(x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); + + config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); + FlattenIndicesKernel<<>>(mask_indices.data(), + d_sparse_offsets.data(), + mask_indices.numel(), + sparse_dim, + mask_indexs_ptr); + // 4. call thrust::lower_bound + thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), + x_indexs_ptr, + x_indexs_ptr + x_indexs.numel(), + mask_indexs_ptr, + mask_indexs_ptr + mask_indexs.numel(), + bound_out_ptr); + + // 5. copy value to out + *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); + phi::funcs::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); + T* out_ptr = out->data(); + + SparseMaskCopyKernel<<>>(x_indexs_ptr, + mask_indexs_ptr, + bound_out_ptr, + x.non_zero_elements().data(), + mask_indexs.numel(), + x.non_zero_elements().dims()[1], + out_ptr); +} + +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SparseMaskHelperGPUKernel", ([&] { + SparseMaskHelperGPUKernel(dev_ctx, x, mask_indices, out); + })); +} + } // namespace sparse } // namespace phi @@ -138,3 +278,17 @@ PD_REGISTER_KERNEL(sparse_mask, int64_t) { kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_mask_helper, + GPU, + ALL_LAYOUT, + phi::sparse::SparseMaskHelperKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 1109baf92e302..0b6ac1aed0147 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -665,3 +665,15 @@ PD_REGISTER_KERNEL(csr_values, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(sparse_coo_tensor, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) {} diff --git a/paddle/phi/kernels/sparse/sparse_mask_kernel.h b/paddle/phi/kernels/sparse/sparse_mask_kernel.h index 210412abd8620..88899e3dc672e 100644 --- a/paddle/phi/kernels/sparse/sparse_mask_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_mask_kernel.h @@ -26,5 +26,11 @@ void SparseMaskKernel(const Context& dev_ctx, const SparseCooTensor& mask, SparseCooTensor* out); +template +void SparseMaskHelperKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& mask_indices, + DenseTensor* out); + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 35329807e7798..71958db1d3fd8 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -66,6 +66,19 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + 
int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(coo_values_grad, GPU, @@ -95,4 +108,16 @@ PD_REGISTER_KERNEL(sparse_coo_to_dense_grad, int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(sparse_coo_tensor_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooTensorGradKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} #endif diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h index 0775582bf1fb8..a00b9c275c292 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" namespace phi { namespace sparse { @@ -32,5 +33,13 @@ void SparseCooToDenseGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, SparseCooTensor* x_grad); +template +void SparseCooTensorGradKernel(const Context& dev_ctx, + const DenseTensor& indices, + const SparseCooTensor& out_grad, + DenseTensor* values_grad) { + SparseMaskHelperKernel(dev_ctx, out_grad, indices, values_grad); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 961cd9f829eb2..8ead6034b589b 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include "paddle/phi/api/lib/utils/storage.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" @@ -147,5 +148,16 @@ void CsrValuesKernel(const Context& dev_ctx, *out = x.non_zero_elements(); } +template +void SparseCooTensorKernel(const Context& dev_ctx, + const DenseTensor& indices, + const DenseTensor& values, + const IntArray& dense_shape, + SparseCooTensor* out) { + *out = + SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); + // TODO(zhangkaihuo): sort and merge the dumplicate indices +} + } // namespace sparse } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 04488ac58c5fb..f486f6469e599 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -186,6 +186,27 @@ def test_coo_values_grad(self): values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) + def test_sparse_coo_tensor_grad(self): + with _test_eager_guard(): + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + print(sparse_x) + grad_indices = [[0, 1], [1, 0]] + grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + sparse_x.backward(sparse_out_grad) + print(sparse_x.grad) + print(values.grad) + print(indices.grad) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index e29351e3d179c..34a62b947ec62 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -107,7 +107,9 @@ def sparse_coo_tensor(indices, values = _handle_dtype(values, dtype) if shape is None: shape = _infer_dense_shape(indices) - return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient) +#return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient) + return _C_ops.final_state_sparse_create_sparse_coo_tensor(indices, values, + shape) #TODO: need to support shape is None diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 7bdd77e27bcef..4f30e966e9eb7 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -21,6 +21,14 @@ layout : x backward : coo_values_grad +- api : create_sparse_coo_tensor + args : (Tensor indices, Tensor values, IntArray dense_shape) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_tensor + layout : values + backward : create_sparse_coo_tensor_grad + - api : csr_values args : (Tensor x) output : Tensor(out@DenseTensor) diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 800145b06e0b6..2280ed6ad61a4 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -19,6 +19,13 @@ kernel : func : coo_values_grad +- backward_api : 
create_sparse_coo_tensor_grad + forward : create_sparse_coo_tensor(Tensor indices, Tensor values, IntArray dense_shape) -> Tensor(out@SparseCooTensor) + args : (Tensor indices, Tensor out_grad) + output : Tensor(values_grad@DenseTensor) + kernel : + func : sparse_coo_tensor_grad + - backward_api : dense_to_coo_grad forward : dense_to_coo(Tensor x, int64_t sparse_dim) -> Tensor(out@SparseCooTensor) args : (Tensor out_grad) From 311700fd9a2329b23985abd4dfe8fcc9491343eb Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 11 Apr 2022 09:28:31 +0000 Subject: [PATCH 03/23] fix bug --- .../phi/kernels/funcs/sparse/common_shape.h | 16 ++++----- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 14 +++++--- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 26 ++++++++------ .../sparse/sparse_utils_grad_kernel.cc | 2 +- .../phi/kernels/sparse/sparse_utils_kernel.h | 2 +- .../tests/unittests/test_sparse_utils_op.py | 25 ++++++++++--- python/paddle/sparse/creation.py | 36 ++++++++++++++++--- python/paddle/utils/code_gen/sparse_api.yaml | 2 +- .../paddle/utils/code_gen/sparse_bw_api.yaml | 2 +- 9 files changed, 89 insertions(+), 36 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index ea7583a16f9e2..b28bc30e1ffcf 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -41,11 +41,11 @@ inline const DDim InferDenseDims(const DDim& x_dims, } template -inline IntT HOSTDEVICE IndicesToIndex(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - const int i) { +inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { IntT index = 0; for (IntT j = 0; j < sparse_dim; j++) { index += indices[j * non_zero_num + i] * sparse_offsets[j]; @@ -55,9 +55,9 @@ inline IntT HOSTDEVICE IndicesToIndex(const IntT* indices, // 1. indices.dims().size() == 2 template -inline const void CalcOffsetsPerDim(const DenseTensor& indices, - const DDim& dims, - std::vector* offsets) { +inline void CalcOffsetsPerDim(const DenseTensor& indices, + const DDim& dims, + std::vector* offsets) { const DDim& indices_dims = indices.dims(); const IntT sparse_dim = indices_dims[0]; IntT offset = 1; diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 8d9f0c37df18c..7f61ab7765a12 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -123,17 +123,21 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, sparse_dim, &mask_indexs); - std::set x_indexs_set(x_indexs.begin(), x_indexs.end()); + std::unordered_map x_indexs_map; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + x_indexs_map[x_indexs[i]] = i; + } *out = phi::EmptyLike(dev_ctx, x.non_zero_elements()); T* out_ptr = out->data(); memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); - const int stride = x.non_zero_elements().dims()[1]; + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; const T* in_ptr = x.non_zero_elements().data(); for (uint64_t i = 0; i < mask_indexs.size(); i++) { - auto iter = x_indexs_set.find(mask_indexs[i]); - if (iter != x_indexs_set.end()) { + auto iter = x_indexs_map.find(mask_indexs[i]); + if (iter != x_indexs_map.end()) { memcpy(out_ptr + i * stride, - in_ptr + mask_indexs[i] * stride, + in_ptr + iter->second * stride, stride * sizeof(T)); } } diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 1f6b32d90b14e..a9f0408cd2a8d 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -167,21 +167,20 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, const int64_t sparse_dim = x.non_zero_indices().dims()[0]; auto indices_dtype = paddle::experimental::CppTypeToDataType::Type(); - std::vector sparse_offsets( - sparse_dim); //, x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); + std::vector sparse_offsets(sparse_dim); - DenseTensorMeta x_indices_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); - DenseTensorMeta mask_indices_meta( + DenseTensorMeta x_indexs_meta(indices_dtype, {x.nnz()}, DataLayout::NCHW); + DenseTensorMeta mask_indexs_meta( indices_dtype, {mask_indices.dims()[1]}, DataLayout::NCHW); DenseTensorMeta sparse_offset_meta( indices_dtype, {sparse_dim}, DataLayout::NCHW); DenseTensor x_indexs = - phi::Empty(dev_ctx, std::move(x_indices_meta)); + phi::Empty(dev_ctx, std::move(x_indexs_meta)); DenseTensor mask_indexs = - phi::Empty(dev_ctx, std::move(mask_indices_meta)); + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); DenseTensor bound_out = - phi::Empty(dev_ctx, std::move(mask_indices_meta)); + phi::Empty(dev_ctx, std::move(mask_indexs_meta)); DenseTensor d_sparse_offsets = phi::Empty(dev_ctx, std::move(sparse_offset_meta)); IntT* x_indexs_ptr = x_indexs.data(); @@ -221,11 +220,15 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, 0, dev_ctx.stream()>>>(mask_indices.data(), d_sparse_offsets.data(), - mask_indices.numel(), + mask_indexs.numel(), sparse_dim, mask_indexs_ptr); - // 4. call thrust::lower_bound +// 4. call thrust::lower_bound +#ifdef PADDLE_WITH_HIP + thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), +#else thrust::lower_bound(thrust::cuda::par.on(dev_ctx.stream()), +#endif x_indexs_ptr, x_indexs_ptr + x_indexs.numel(), mask_indexs_ptr, @@ -238,6 +241,9 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, set_zero(dev_ctx, out, static_cast(0)); T* out_ptr = out->data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; + SparseMaskCopyKernel<<(), mask_indexs.numel(), - x.non_zero_elements().dims()[1], + stride, out_ptr); } diff --git a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc index 71958db1d3fd8..15d78692f4f35 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/sparse_utils_grad_kernel.cc @@ -76,7 +76,7 @@ PD_REGISTER_KERNEL(sparse_coo_tensor_grad, int16_t, int, int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 8ead6034b589b..8cf9c0a28648a 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -150,8 +150,8 @@ void CsrValuesKernel(const Context& dev_ctx, template void SparseCooTensorKernel(const Context& dev_ctx, - const DenseTensor& indices, const DenseTensor& values, + const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { *out = diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index f486f6469e599..933e8b952d972 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -18,6 +18,7 @@ import paddle import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard +from paddle import _C_ops class TestSparseCreate(unittest.TestCase): @@ -195,17 +196,31 @@ def test_sparse_coo_tensor_grad(self): values, dtype='float32', stop_gradient=False) sparse_x = paddle.sparse.sparse_coo_tensor( indices, values, shape=[2, 2], stop_gradient=False) - print(sparse_x) - grad_indices = [[0, 1], [1, 0]] + grad_indices = [[0, 1], [1, 1]] grad_values = [2, 3] grad_indices = paddle.to_tensor(grad_indices, dtype='int32') grad_values = paddle.to_tensor(grad_values, dtype='float32') sparse_out_grad = paddle.sparse.sparse_coo_tensor( grad_indices, grad_values, shape=[2, 2]) sparse_x.backward(sparse_out_grad) - print(sparse_x.grad) - print(values.grad) - print(indices.grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, values.grad.numpy()) + + place = core.CPUPlace() + indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) + values_cpu = paddle.to_tensor( + values, dtype='float32', place=place, stop_gradient=False) + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + indices_cpu, + values_cpu, + shape=[2, 2], + place=place, + stop_gradient=False) + + sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2], place=place) + sparse_x_cpu.backward(sparse_out_grad_cpu) + assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) if __name__ == "__main__": diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index 34a62b947ec62..ac9276f3142c0 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -14,6 +14,7 @@ from paddle import _C_ops from ..framework import core, dygraph_only +from ..framework import _current_expected_place, _get_paddle_place from ..tensor import to_tensor from ..tensor import max from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype @@ -38,6 
+39,18 @@ def _infer_dense_shape(indices): return list(lens.numpy()) +def _get_place(place): + place = _get_paddle_place(place) + if place is None: + place = _current_expected_place() + elif not isinstance(place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, + core.CUDAPlace)): + raise ValueError( + "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace" + ) + return place + + @dygraph_only def sparse_coo_tensor(indices, values, @@ -94,6 +107,8 @@ def sparse_coo_tensor(indices, # values=[1., 2., 3.]) """ + place = _get_place(place) + if not isinstance(indices, core.eager.Tensor): indices = to_tensor( indices, dtype=None, place=place, stop_gradient=True) @@ -101,14 +116,19 @@ def sparse_coo_tensor(indices, values = to_tensor(values, dtype, place, stop_gradient) if len(indices.shape) != 2: raise ValueError("'indices' must be 2-D.") - if place is not None: + + if not indices.place._equals(place): indices = indices._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient + if shape is None: shape = _infer_dense_shape(indices) -#return core.eager.sparse_coo_tensor(indices, values, shape, stop_gradient) - return _C_ops.final_state_sparse_create_sparse_coo_tensor(indices, values, + + return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, shape) @@ -173,6 +193,9 @@ def sparse_csr_tensor(crows, # cols=[1, 3, 2, 0, 1], # values=[1, 2, 3, 4, 5]) """ + + place = _get_place(place) + if not isinstance(crows, core.eager.Tensor): crows = to_tensor(crows, dtype=None, place=place, stop_gradient=True) if not isinstance(cols, core.eager.Tensor): @@ -184,10 +207,15 @@ def sparse_csr_tensor(crows, "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D." 
) - if place is not None: + if not crows.place._equals(place): crows = crows._copy_to(place, False) + + if not cols.place._equals(place): cols = cols._copy_to(place, False) + + if not values.place._equals(place): values = values._copy_to(place, False) values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient return core.eager.sparse_csr_tensor(crows, cols, values, shape, stop_gradient) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 4f30e966e9eb7..2187d4abb2d63 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -22,7 +22,7 @@ backward : coo_values_grad - api : create_sparse_coo_tensor - args : (Tensor indices, Tensor values, IntArray dense_shape) + args : (Tensor values, Tensor indices, IntArray dense_shape) output : Tensor(out@SparseCooTensor) kernel : func : sparse_coo_tensor diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 2280ed6ad61a4..e3946cbf72bc2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -20,7 +20,7 @@ func : coo_values_grad - backward_api : create_sparse_coo_tensor_grad - forward : create_sparse_coo_tensor(Tensor indices, Tensor values, IntArray dense_shape) -> Tensor(out@SparseCooTensor) + forward : create_sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out@SparseCooTensor) args : (Tensor indices, Tensor out_grad) output : Tensor(values_grad@DenseTensor) kernel : From 8e7ae8b2c670f28ace7633b036035057c3c56465 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 11 Apr 2022 11:45:55 +0000 Subject: [PATCH 04/23] opt code --- .../phi/kernels/funcs/sparse/common_shape.h | 14 ++++++ .../kernels/sparse/cpu/sparse_mask_kernel.cc | 44 +++++++------------ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 12 +++-- 3 files changed, 39 insertions(+), 31 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index b28bc30e1ffcf..722d3e5e8b2ce 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -53,6 +53,20 @@ inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices, return index; } +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int start, + const int stride, + IntT* out) { + for (int i = start; i < non_zero_num; i += stride) { + out[i] = + IndicesToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + // 1. 
indices.dims().size() == 2 template inline void CalcOffsetsPerDim(const DenseTensor& indices, diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 7f61ab7765a12..9dc20ed8b95c8 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -55,16 +55,12 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, const IntT* indices_ptr = indices.data(); std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); - // phi::funcs::sparse::FlattenIndices(indices, x.dims(), &out_indexs); for (int64_t i = 0; i < non_zero_num; i++) { - out_indexs[i] = phi::funcs::sparse::IndicesToIndex( + int64_t index = phi::funcs::sparse::IndicesToIndex( indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); - } - - for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = out_indexs[i]; memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } + out->SetMember(out_indices, out_values, dims, true); } @@ -101,27 +97,20 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, phi::funcs::sparse::CalcOffsetsPerDim( x.non_zero_indices(), x.dims(), &sparse_offsets); - auto FlattenIndices = [](const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - std::vector* out) { - for (int64_t i = 0; i < non_zero_num; i++) { - (*out)[i] = phi::funcs::sparse::IndicesToIndex( - indices, sparse_offsets, non_zero_num, sparse_dim, i); - } - }; - - FlattenIndices(x.non_zero_indices().data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - &x_indexs); - FlattenIndices(mask_indices.data(), - sparse_offsets.data(), - x.nnz(), - sparse_dim, - &mask_indexs); + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + phi::funcs::sparse::FlattenIndices(mask_indices.data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + mask_indexs.data()); std::unordered_map x_indexs_map; for (uint64_t i = 0; i < x_indexs.size(); i++) { @@ -133,6 +122,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, const int64_t stride = x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; const T* in_ptr = x.non_zero_elements().data(); + // TODO(zhangkaihuo): multithreading can be used for acceleration for (uint64_t i = 0; i < mask_indexs.size(); i++) { auto iter = x_indexs_map.find(mask_indexs[i]); if (iter != x_indexs_map.end()) { diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index a9f0408cd2a8d..624f2b468e3cc 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -130,10 +130,14 @@ __global__ void FlattenIndicesKernel(const IntT* indices, const int64_t non_zero_num, const int64_t sparse_dim, IntT* out) { - CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { - out[i] = phi::funcs::sparse::IndicesToIndex( - indices, sparse_offsets, non_zero_num, sparse_dim, i); - } + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); } template From 4622c74c6b06130ce6880b2851f846356e62f80d Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 11 Apr 2022 11:52:40 +0000 Subject: [PATCH 05/23] add todo --- paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 624f2b468e3cc..ca5097d0ea872 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -124,6 +124,7 @@ void SparseMaskKernel(const Context& dev_ctx, })); } +// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices template __global__ void FlattenIndicesKernel(const IntT* indices, const IntT* sparse_offsets, From 760262a6f4635d6ef6d142d3ec8e946ea87a42a0 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Tue, 12 Apr 2022 02:17:28 +0000 Subject: [PATCH 06/23] fix sparse_mask_cpu --- .../phi/kernels/funcs/sparse/common_shape.h | 6 ++-- .../kernels/sparse/cpu/sparse_mask_kernel.cc | 6 +++- .../kernels/sparse/gpu/sparse_mask_kernel.cu | 3 +- .../tests/unittests/test_sparse_utils_op.py | 30 ++++++++++++++----- 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index 722d3e5e8b2ce..e4c836d116252 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -69,11 +69,9 @@ inline void HOSTDEVICE FlattenIndices(const IntT* indices, // 1. 
indices.dims().size() == 2 template -inline void CalcOffsetsPerDim(const DenseTensor& indices, - const DDim& dims, +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, std::vector* offsets) { - const DDim& indices_dims = indices.dims(); - const IntT sparse_dim = indices_dims[0]; IntT offset = 1; for (IntT i = sparse_dim - 1; i >= 0; i--) { (*offsets)[i] = offset; diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 9dc20ed8b95c8..a07a7fb2ecf44 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -55,6 +55,10 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, const IntT* indices_ptr = indices.data(); std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + dims, sparse_dim, &sparse_offsets); + for (int64_t i = 0; i < non_zero_num; i++) { int64_t index = phi::funcs::sparse::IndicesToIndex( indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); @@ -95,7 +99,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); phi::funcs::sparse::CalcOffsetsPerDim( - x.non_zero_indices(), x.dims(), &sparse_offsets); + x.dims(), sparse_dim, &sparse_offsets); phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), sparse_offsets.data(), diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index ca5097d0ea872..96ab56697b9b0 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -193,8 +193,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, IntT* bound_out_ptr = bound_out.data(); // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim( - x.non_zero_indices(), x.dims(), &sparse_offsets); + phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); // 2. 
copy sparse_offsets to device phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 933e8b952d972..89cfc711910ce 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -18,7 +18,6 @@ import paddle import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard -from paddle import _C_ops class TestSparseCreate(unittest.TestCase): @@ -135,9 +134,11 @@ def test_to_sparse_coo(self): #test to_sparse_coo_grad backward out_grad_indices = [[0, 1], [0, 1]] out_grad_values = [2.0, 3.0] - out_grad = core.eager.sparse_coo_tensor( + out_grad = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(out_grad_indices), - paddle.to_tensor(out_grad_values), out.shape, True) + paddle.to_tensor(out_grad_values), + shape=out.shape, + stop_gradient=True) out.backward(out_grad) assert np.array_equal(dense_x.grad.numpy(), out_grad.to_dense().numpy()) @@ -146,9 +147,11 @@ def test_coo_to_dense(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) dense_tensor = sparse_x.to_dense() #test to_dense_grad backward out_grad = [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], @@ -159,6 +162,17 @@ def test_coo_to_dense(self): assert np.array_equal(correct_x_grad, sparse_x.grad.values().numpy()) + paddle.device.set_device("cpu") + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( + paddle.to_tensor(indices), + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) + dense_tensor_cpu = sparse_x_cpu.to_dense() + dense_tensor_cpu.backward(paddle.to_tensor(out_grad)) + assert np.array_equal(correct_x_grad, + sparse_x_cpu.grad.values().numpy()) + def test_to_sparse_csr(self): with _test_eager_guard(): x = [[0, 1, 0, 2], [0, 0, 3, 0], [4, 5, 0, 0]] @@ -178,9 +192,11 @@ def test_coo_values_grad(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = core.eager.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), - paddle.to_tensor(values), [3, 4], False) + paddle.to_tensor(values), + shape=[3, 4], + stop_gradient=False) values_tensor = sparse_x.values() out_grad = [2.0, 3.0, 5.0, 8.0, 9.0] # test coo_values_grad From b6bfb2a48a0a59211d8afbaccfed60042102f3e7 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 14 Apr 2022 02:52:21 +0000 Subject: [PATCH 07/23] sparse sort --- .../phi/kernels/funcs/sparse/common_shape.h | 39 ----- .../kernels/funcs/sparse/flatten_indices.cu.h | 57 +++++++ .../kernels/funcs/sparse/flatten_indices.h | 93 ++++++++++ paddle/phi/kernels/sparse/cpu/sort_kernel.cc | 98 +++++++++++ .../kernels/sparse/cpu/sparse_mask_kernel.cc | 8 +- .../kernels/sparse/cpu/sparse_utils_kernel.cc | 1 - paddle/phi/kernels/sparse/gpu/sort_kernel.cu | 161 ++++++++++++++++++ .../kernels/sparse/gpu/sparse_mask_kernel.cu | 56 +++--- paddle/phi/kernels/sparse/sort_kernel.h | 30 ++++ .../phi/kernels/sparse/sparse_utils_kernel.h | 9 +- .../tests/unittests/test_sparse_utils_op.py | 68 ++++---- python/paddle/utils/code_gen/sparse_api.yaml | 1 + 12 files changed, 507 insertions(+), 
114 deletions(-) create mode 100644 paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h create mode 100644 paddle/phi/kernels/funcs/sparse/flatten_indices.h create mode 100644 paddle/phi/kernels/sparse/cpu/sort_kernel.cc create mode 100644 paddle/phi/kernels/sparse/gpu/sort_kernel.cu create mode 100644 paddle/phi/kernels/sparse/sort_kernel.h diff --git a/paddle/phi/kernels/funcs/sparse/common_shape.h b/paddle/phi/kernels/funcs/sparse/common_shape.h index e4c836d116252..3617e3cd2f406 100644 --- a/paddle/phi/kernels/funcs/sparse/common_shape.h +++ b/paddle/phi/kernels/funcs/sparse/common_shape.h @@ -40,45 +40,6 @@ inline const DDim InferDenseDims(const DDim& x_dims, return values_dims; } -template -inline const IntT HOSTDEVICE IndicesToIndex(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - const int i) { - IntT index = 0; - for (IntT j = 0; j < sparse_dim; j++) { - index += indices[j * non_zero_num + i] * sparse_offsets[j]; - } - return index; -} - -template -inline void HOSTDEVICE FlattenIndices(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - const int start, - const int stride, - IntT* out) { - for (int i = start; i < non_zero_num; i += stride) { - out[i] = - IndicesToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); - } -} - -// 1. indices.dims().size() == 2 -template -inline void CalcOffsetsPerDim(const DDim& dims, - const int64_t sparse_dim, - std::vector* offsets) { - IntT offset = 1; - for (IntT i = sparse_dim - 1; i >= 0; i--) { - (*offsets)[i] = offset; - offset *= dims[i]; - } -} - } // namespace sparse } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h new file mode 100644 index 0000000000000..26b8549aaafdc --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h @@ -0,0 +1,57 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +__global__ void FlattenIndicesKernel(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* out) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + phi::funcs::sparse::FlattenIndices(indices, + sparse_offsets, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + out); +} + +template +__global__ void IndexToCoordinateKernel(const IntT* indexs, + const Dim dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + IntT* indices) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + IndexToCoordinate(indexs, + dims, + non_zero_num, + sparse_dim, + tid, + gridDim.x * blockDim.x, + indices); +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/flatten_indices.h b/paddle/phi/kernels/funcs/sparse/flatten_indices.h new file mode 100644 index 0000000000000..ca212e4366ec4 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/flatten_indices.h @@ -0,0 +1,93 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { +namespace sparse { + +template +inline const IntT HOSTDEVICE CoordinateToIndex(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int i) { + IntT index = 0; + for (IntT j = 0; j < sparse_dim; j++) { + index += indices[j * non_zero_num + i] * sparse_offsets[j]; + } + return index; +} + +template +inline void HOSTDEVICE FlattenIndices(const IntT* indices, + const IntT* sparse_offsets, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* out) { + for (int64_t i = start; i < non_zero_num; i += stride) { + out[i] = + CoordinateToIndex(indices, sparse_offsets, non_zero_num, sparse_dim, i); + } +} + +// 1. 
indices.dims().size() == 2 +template +inline void CalcOffsetsPerDim(const DDim& dims, + const int64_t sparse_dim, + IntT* offsets) { + IntT offset = 1; + for (IntT i = sparse_dim - 1; i >= 0; i--) { + offsets[i] = offset; + offset *= dims[i]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT index, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int indices_offset, + IntT* indices) { + IntT tmp_index = index; + for (int j = sparse_dim - 1; j >= 0; j--) { + indices[j * non_zero_num + indices_offset] = tmp_index % dims[j]; + tmp_index /= dims[j]; + } +} + +template +inline void HOSTDEVICE IndexToCoordinate(const IntT* indexs, + const Dim& dims, + const int64_t non_zero_num, + const int64_t sparse_dim, + const int64_t start, + const int64_t stride, + IntT* indices) { + for (int64_t i = start; i < non_zero_num; i += stride) { + IntT tmp_index = indexs[i]; + IndexToCoordinate(tmp_index, dims, non_zero_num, sparse_dim, i, indices); + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sort_kernel.cc b/paddle/phi/kernels/sparse/cpu/sort_kernel.cc new file mode 100644 index 0000000000000..eee44ebda7f6f --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/sort_kernel.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/sort_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" + +#include "paddle/phi/api/ext/dispatch.h" + +namespace phi { +namespace sparse { + +template +void SortCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), + sparse_offsets.data(), + x.nnz(), + sparse_dim, + 0, + 1, + x_indexs.data()); + + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; + + std::map indices_to_index; + for (uint64_t i = 0; i < x_indexs.size(); i++) { + indices_to_index[x_indexs[i]] = i; + } + + IntT* out_indices_ptr = out_indices.data(); + T* out_values_ptr = out_values.data(); + auto iter = indices_to_index.begin(); + + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + for (int i = 0; iter != indices_to_index.end(); iter++, i++) { + phi::funcs::sparse::IndexToCoordinate( + iter->first, const_dims, x.nnz(), sparse_dim, i, out_indices_ptr); + memcpy(out_values_ptr + i * stride, + x_values_ptr + iter->second * stride, + stride * sizeof(T)); + } + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void SortKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SortCPUKernel", ([&] { + SortCPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + CPU, + ALL_LAYOUT, + phi::sparse::SortKernel, + float, + double, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index a07a7fb2ecf44..33e62c5bfe88a 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" #include "paddle/phi/api/ext/dispatch.h" @@ -57,10 +57,10 @@ void SparseMaskCPUKernel(const CPUContext& dev_ctx, std::vector out_indexs(non_zero_num), sparse_offsets(sparse_dim); phi::funcs::sparse::CalcOffsetsPerDim( - dims, sparse_dim, &sparse_offsets); + dims, sparse_dim, sparse_offsets.data()); for (int64_t i = 0; i < non_zero_num; i++) { - int64_t index = phi::funcs::sparse::IndicesToIndex( + int64_t index = phi::funcs::sparse::CoordinateToIndex( indices_ptr, sparse_offsets.data(), non_zero_num, sparse_dim, i); memcpy(out_values_ptr + i * cols, x_ptr + index * cols, cols * sizeof(T)); } @@ -99,7 +99,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, std::vector sparse_offsets(sparse_dim), x_indexs(x.nnz()), mask_indexs(mask_indices.dims()[1]); phi::funcs::sparse::CalcOffsetsPerDim( - x.dims(), sparse_dim, &sparse_offsets); + x.dims(), sparse_dim, sparse_offsets.data()); phi::funcs::sparse::FlattenIndices(x.non_zero_indices().data(), sparse_offsets.data(), diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 0499371a4dd17..081981a0fb8ec 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -401,7 +401,6 @@ PD_REGISTER_KERNEL(sparse_coo_tensor, phi::sparse::SparseCooTensorKernel, float, double, - phi::dtype::float16, uint8_t, int16_t, int, diff --git a/paddle/phi/kernels/sparse/gpu/sort_kernel.cu b/paddle/phi/kernels/sparse/gpu/sort_kernel.cu new file mode 100644 index 0000000000000..f1cf91425cb36 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sort_kernel.cu @@ -0,0 +1,161 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" +#include "paddle/phi/kernels/funcs/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" +#include "paddle/phi/kernels/sparse/sort_kernel.h" + +#include "paddle/phi/api/ext/dispatch.h" + +namespace phi { +namespace sparse { + +template +__global__ void CopyValuesKernel(const T* x_values, + const IntT* indexs, + const int64_t n, + const int64_t stride, + T* out_values) { + CUDA_KERNEL_LOOP_TYPE(i, n * stride, int64_t) { + int64_t indices_i = i / stride; + int64_t slice_i = i - indices_i * stride; // offset inside the slice + IntT in_i = indexs[indices_i]; + out_values[i * stride + slice_i] = x_values[in_i * stride + slice_i]; + } +} + +template +void SortGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + const DenseTensor& x_indices = x.non_zero_indices(); + const DenseTensor& x_values = x.non_zero_elements(); + DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); + DenseTensor out_values = phi::EmptyLike(dev_ctx, x_values); + + const int64_t nnz = x.nnz(); + const int64_t sparse_dim = x.non_zero_indices().dims()[0]; + std::vector sparse_offsets(sparse_dim); + + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); + + DenseTensorMeta sparse_offset_meta( + paddle::experimental::CppTypeToDataType::Type(), + {sparse_dim}, + DataLayout::NCHW); + DenseTensor d_sparse_offsets = + phi::Empty(dev_ctx, std::move(sparse_offset_meta)); + DenseTensor indexs = phi::Empty( + dev_ctx, DenseTensorMeta(x_indices.dtype(), {nnz}, x_indices.layout())); + IntT* indexs_ptr = indexs.data(); + + phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), + sparse_offsets.data(), + sizeof(IntT) * sparse_dim, +#ifdef PADDLE_WITH_HIP + hipMemcpyHostToDevice, +#else + cudaMemcpyHostToDevice, +#endif + dev_ctx.stream()); + + // 1. flatten indices + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + indexs.numel(), + sparse_dim, + indexs_ptr); + + // 2. get the address of each non-zero values + const T* x_values_ptr = x_values.data(); + const int64_t stride = + x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + DenseTensor values_indexs = phi::Empty( + dev_ctx, DenseTensorMeta(DataType::INT32, {nnz}, DataLayout::NCHW)); + int* values_indexs_ptr = values_indexs.data(); + + // values_indexs = [0,1,2,,,nnz-1] + phi::IndexKernel>( + dev_ctx, &values_indexs, kps::IdentityFunctor()); + +// 3. 
sort (indices, values index) +#ifdef PADDLE_WITH_HIP + thrust::sort_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + values_indexs_ptr); + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); + // 4. scatter the values + CopyValuesKernel<<>>( + x_values_ptr, values_indexs_ptr, nnz, stride, out_values.data()); + + // 6. convert index to coordinate + Dim const_dims; + for (int i = 0; i < x.dims().size(); i++) { + const_dims[i] = x.dims()[i]; + } + + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + phi::funcs::sparse::IndexToCoordinateKernel<<>>( + indexs_ptr, const_dims, nnz, sparse_dim, out_indices.data()); + + out->SetMember(out_indices, out_values, x.dims(), true); +} + +template +void SortKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_DISPATCH_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "SortGPUKernel", ([&] { + SortGPUKernel(dev_ctx, x, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(sort, + GPU, + ALL_LAYOUT, + phi::sparse::SortKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 96ab56697b9b0..c20234897bb93 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -22,7 +22,7 @@ limitations under the License. */ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/funcs/sparse/common_shape.h" +#include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" #include "paddle/phi/kernels/sparse/sparse_mask_kernel.h" #include "paddle/phi/api/ext/dispatch.h" @@ -124,23 +124,6 @@ void SparseMaskKernel(const Context& dev_ctx, })); } -// TODO(zhangkaihuo): Use an op to realize the function of FlattenIndices -template -__global__ void FlattenIndicesKernel(const IntT* indices, - const IntT* sparse_offsets, - const int64_t non_zero_num, - const int64_t sparse_dim, - IntT* out) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - phi::funcs::sparse::FlattenIndices(indices, - sparse_offsets, - non_zero_num, - sparse_dim, - tid, - gridDim.x * blockDim.x, - out); -} - template __global__ void SparseMaskCopyKernel(const IntT* x_indexs, const IntT* mask_indexs, @@ -193,7 +176,8 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, IntT* bound_out_ptr = bound_out.data(); // 1. calc the offsets of per dim - phi::funcs::sparse::CalcOffsetsPerDim(x.dims(), sparse_dim, &sparse_offsets); + phi::funcs::sparse::CalcOffsetsPerDim( + x.dims(), sparse_dim, sparse_offsets.data()); // 2. copy sparse_offsets to device phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data(), sparse_offsets.data(), @@ -208,25 +192,27 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, // 3. 
flatten x indices and mask indices auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_indexs.numel(), 1); - FlattenIndicesKernel<<>>(x.non_zero_indices().data(), - d_sparse_offsets.data(), - x_indexs.numel(), - sparse_dim, - x_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + x.non_zero_indices().data(), + d_sparse_offsets.data(), + x_indexs.numel(), + sparse_dim, + x_indexs_ptr); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, mask_indexs.numel(), 1); - FlattenIndicesKernel<<>>(mask_indices.data(), - d_sparse_offsets.data(), - mask_indexs.numel(), - sparse_dim, - mask_indexs_ptr); + phi::funcs::sparse::FlattenIndicesKernel<<>>( + mask_indices.data(), + d_sparse_offsets.data(), + mask_indexs.numel(), + sparse_dim, + mask_indexs_ptr); // 4. call thrust::lower_bound #ifdef PADDLE_WITH_HIP thrust::lower_bound(thrust::hip::par.on(dev_ctx.stream()), diff --git a/paddle/phi/kernels/sparse/sort_kernel.h b/paddle/phi/kernels/sparse/sort_kernel.h new file mode 100644 index 0000000000000..4fb7714d5bfab --- /dev/null +++ b/paddle/phi/kernels/sparse/sort_kernel.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/kernels/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void SortKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 8cf9c0a28648a..83de8bfec5d1b 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/sort_kernel.h" namespace phi { namespace sparse { @@ -154,9 +155,11 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - *out = - SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); - // TODO(zhangkaihuo): sort and merge the dumplicate indices + SparseCooTensor before_sorted( + indices, values, phi::make_ddim(dense_shape.GetData())); + // sort + SortKernel(dev_ctx, before_sorted, out); + // TODO(zhangkaihuo): merge the dumplicate indices } } // namespace sparse diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 89cfc711910ce..2ebd0edb20f77 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -19,6 +19,8 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard +devices = ['cpu', 'gpu'] + class TestSparseCreate(unittest.TestCase): def test_create_coo_by_tensor(self): @@ -39,6 +41,7 @@ def test_create_coo_by_np(self): values = [1.0, 2.0, 3.0] dense_shape = [2, 3] coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -205,38 +208,39 @@ def test_coo_values_grad(self): def test_sparse_coo_tensor_grad(self): with _test_eager_guard(): - indices = [[0, 1], [0, 1]] - values = [1, 2] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor( - values, dtype='float32', stop_gradient=False) - sparse_x = paddle.sparse.sparse_coo_tensor( - indices, values, shape=[2, 2], stop_gradient=False) - grad_indices = [[0, 1], [1, 1]] - grad_values = [2, 3] - grad_indices = paddle.to_tensor(grad_indices, dtype='int32') - grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2]) - sparse_x.backward(sparse_out_grad) - correct_values_grad = [0, 3] - assert np.array_equal(correct_values_grad, values.grad.numpy()) - - place = core.CPUPlace() - indices_cpu = paddle.to_tensor(indices, dtype='int32', place=place) - values_cpu = paddle.to_tensor( - values, dtype='float32', place=place, stop_gradient=False) - sparse_x_cpu = paddle.sparse.sparse_coo_tensor( - indices_cpu, - values_cpu, - shape=[2, 2], - place=place, - stop_gradient=False) - - sparse_out_grad_cpu = paddle.sparse.sparse_coo_tensor( - grad_indices, grad_values, shape=[2, 2], place=place) - sparse_x_cpu.backward(sparse_out_grad_cpu) - assert np.array_equal(correct_values_grad, values_cpu.grad.numpy()) + for device in devices: + paddle.device.set_device(device) + indices = [[0, 1], [0, 1]] + values = [1, 2] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2], stop_gradient=False) + grad_indices = [[0, 1], [1, 1]] + grad_values = [2, 3] + grad_indices = paddle.to_tensor(grad_indices, dtype='int32') + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2]) + 
sparse_x.backward(sparse_out_grad) + correct_values_grad = [0, 3] + assert np.array_equal(correct_values_grad, values.grad.numpy()) + + def test_sparse_coo_tensor_sorted(self): + with _test_eager_guard(): + for device in devices: + paddle.device.set_device(device) + indices = [[1, 0], [0, 1]] + values = [1.0, 2.0] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + indices_sorted = [[0, 1], [1, 0]] + values_sorted = [2.0, 1.0] + assert np.array_equal(indices_sorted, + sparse_x.indices().numpy()) + assert np.array_equal(values_sorted, sparse_x.values().numpy()) if __name__ == "__main__": diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 2187d4abb2d63..100d7ad78319b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -27,6 +27,7 @@ kernel : func : sparse_coo_tensor layout : values + data_type : values backward : create_sparse_coo_tensor_grad - api : csr_values From 8689f7ae3465eec860423c1c7817c547ab182cc1 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 14 Apr 2022 03:38:54 +0000 Subject: [PATCH 08/23] check shape --- .../fluid/tests/unittests/test_sparse_utils_op.py | 14 ++++++++++++-- python/paddle/sparse/creation.py | 7 ++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 2ebd0edb20f77..157da73ad07de 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -39,9 +39,8 @@ def test_create_coo_by_np(self): with _test_eager_guard(): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] - dense_shape = [2, 3] + dense_shape = [3, 3] coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) - print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -243,5 +242,16 @@ def test_sparse_coo_tensor_sorted(self): assert np.array_equal(values_sorted, sparse_x.values().numpy()) +class TestError(unittest.TestCase): + def test_sparse_coo_tensor(self): + with self.assertRaises(ValueError): + indices = [[2, 3], [0, 2]] + values = [1, 2] + #the shape too small + dense_shape = [2, 2] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=dense_shape) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index ac9276f3142c0..b48cc60851c99 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -125,8 +125,13 @@ def sparse_coo_tensor(indices, values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient + min_shape = _infer_dense_shape(indices) if shape is None: - shape = _infer_dense_shape(indices) + shape = min_shape + else: + if shape < min_shape: + raise ValueError("the minimun shape required is {}, but get {}". 
+ format(min_shape, shape)) return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices, shape) From 30f85d8c0d056288275d86aa766481b85a3c2756 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 14 Apr 2022 05:02:57 +0000 Subject: [PATCH 09/23] fix float16 --- paddle/phi/kernels/sparse/cpu/sort_kernel.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/phi/kernels/sparse/cpu/sort_kernel.cc b/paddle/phi/kernels/sparse/cpu/sort_kernel.cc index 46e6acfd3a9de..6cbd3ba59a597 100644 --- a/paddle/phi/kernels/sparse/cpu/sort_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sort_kernel.cc @@ -88,6 +88,7 @@ PD_REGISTER_KERNEL(sort, phi::sparse::SortKernel, float, double, + phi::dtype::float16, uint8_t, int16_t, int, From 09b94482cc7045d3aeaf0cd83033ff2c6447d9ce Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 15 Apr 2022 05:48:43 +0000 Subject: [PATCH 10/23] add coalesced --- paddle/phi/kernels/funcs/sparse/scatter.cu.h | 63 ++++++++++++ paddle/phi/kernels/funcs/sparse/utils.cu.h | 31 ++++++ .../{sort_kernel.h => coalesced_kernel.h} | 6 +- .../{sort_kernel.cc => coalesced_kernel.cc} | 54 +++++++--- .../{sort_kernel.cu => coalesced_kernel.cu} | 98 ++++++++++++------- .../phi/kernels/sparse/gpu/convolution.cu.h | 57 +---------- .../sparse/gpu/convolution_grad_kernel.cu | 24 ++--- .../kernels/sparse/gpu/convolution_kernel.cu | 22 +++-- .../phi/kernels/sparse/sparse_utils_kernel.h | 8 +- .../tests/unittests/test_sparse_utils_op.py | 7 +- 10 files changed, 237 insertions(+), 133 deletions(-) create mode 100644 paddle/phi/kernels/funcs/sparse/scatter.cu.h create mode 100644 paddle/phi/kernels/funcs/sparse/utils.cu.h rename paddle/phi/kernels/sparse/{sort_kernel.h => coalesced_kernel.h} (86%) rename paddle/phi/kernels/sparse/cpu/{sort_kernel.cc => coalesced_kernel.cc} (65%) rename paddle/phi/kernels/sparse/gpu/{sort_kernel.cu => coalesced_kernel.cu} (64%) diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h new file mode 100644 index 0000000000000..9ed7cef12a148 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +/** + * brief: scatter add + * input: the inputs + * unique_value: refer to UpdateIndexKernel notes + * out_index: the output feature index + * non_zero_num: the number of output features + * rulebook_len: the length of rulebook + * channels: the output channel size + * out: the outputs +**/ +template +__global__ void ScatterKernel(const T* input, + const int* unique_value, + const int* out_index, + const int non_zero_num, + const int rulebook_len, + const int channels, + T* out, + const bool subm = false) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { + int indices_i = i / channels; + int channels_i = i - indices_i * channels; + + int start = unique_value[indices_i]; + int end = indices_i == non_zero_num - 1 ? rulebook_len + : unique_value[indices_i + 1]; + // max(end-start) = kernel_size + T sum = static_cast(0); + if (subm) { + sum = out[indices_i * channels + channels_i]; + } + for (int j = start; j < end; j++) { + const int out_feature_i = out_index[j]; + sum += input[out_feature_i * channels + channels_i]; + } + out[indices_i * channels + channels_i] = sum; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/sparse/utils.cu.h b/paddle/phi/kernels/funcs/sparse/utils.cu.h new file mode 100644 index 0000000000000..074fe1ca42049 --- /dev/null +++ b/paddle/phi/kernels/funcs/sparse/utils.cu.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +namespace phi { +namespace funcs { +namespace sparse { + +// brief: calculation the distance between start and end +template +__global__ void DistanceKernel(const T* start, const T* end, T* distance) { + if (threadIdx.x == 0) { + *distance = end - start; + } +} + +} // namespace sparse +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sort_kernel.h b/paddle/phi/kernels/sparse/coalesced_kernel.h similarity index 86% rename from paddle/phi/kernels/sparse/sort_kernel.h rename to paddle/phi/kernels/sparse/coalesced_kernel.h index 4fb7714d5bfab..0755579a57ade 100644 --- a/paddle/phi/kernels/sparse/sort_kernel.h +++ b/paddle/phi/kernels/sparse/coalesced_kernel.h @@ -22,9 +22,9 @@ namespace phi { namespace sparse { template -void SortKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out); } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/sort_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc similarity index 65% rename from paddle/phi/kernels/sparse/cpu/sort_kernel.cc rename to paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc index 6cbd3ba59a597..0ebddf9b683f0 100644 --- a/paddle/phi/kernels/sparse/cpu/sort_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/kernels/sparse/sort_kernel.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" @@ -21,9 +21,9 @@ namespace phi { namespace sparse { template -void SortCPUKernel(const CPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalescedCPUKernel(const CPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -46,9 +46,25 @@ void SortCPUKernel(const CPUContext& dev_ctx, const int64_t stride = x.dims().size() == sparse_dim ? 
1 : x.dims().size() - sparse_dim; - std::map indices_to_index; + std::map> indices_to_index; for (uint64_t i = 0; i < x_indexs.size(); i++) { - indices_to_index[x_indexs[i]] = i; + IntT index = x_indexs[i]; + if (indices_to_index.find(index) == indices_to_index.end()) { + std::vector indexs; + indexs.push_back(i); + indices_to_index[index] = indexs; + } else { + indices_to_index[index].push_back(i); + } + } + + const int64_t out_nnz = indices_to_index.size(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); } IntT* out_indices_ptr = out_indices.data(); @@ -62,21 +78,29 @@ void SortCPUKernel(const CPUContext& dev_ctx, for (int i = 0; iter != indices_to_index.end(); iter++, i++) { phi::funcs::sparse::IndexToCoordinate( - iter->first, const_dims, x.nnz(), sparse_dim, i, out_indices_ptr); + iter->first, const_dims, out_nnz, sparse_dim, i, out_indices_ptr); memcpy(out_values_ptr + i * stride, - x_values_ptr + iter->second * stride, + x_values_ptr + iter->second[0] * stride, stride * sizeof(T)); + for (uint64_t j = 1; j < iter->second.size(); j++) { + for (int k = 0; k < stride; k++) { + out_values_ptr[i * stride + k] += + x_values_ptr[iter->second[j] * stride + k]; + } + } } + out->SetMember(out_indices, out_values, x.dims(), true); } template -void SortKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES(x.non_zero_indices().dtype(), "SortCPUKernel", ([&] { - SortCPUKernel(dev_ctx, x, out); - })); +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedCPUKernel", ([&] { + CoalescedCPUKernel(dev_ctx, x, out); + })); } } // namespace sparse @@ -85,7 +109,7 @@ void SortKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(sort, CPU, ALL_LAYOUT, - phi::sparse::SortKernel, + phi::sparse::CoalescedKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/sort_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu similarity index 64% rename from paddle/phi/kernels/sparse/gpu/sort_kernel.cu rename to paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 39369616eca7e..3ffcd28955a53 100644 --- a/paddle/phi/kernels/sparse/gpu/sort_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -17,31 +17,18 @@ limitations under the License. 
*/ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" -#include "paddle/phi/kernels/funcs/scatter.cu.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.cu.h" -#include "paddle/phi/kernels/sparse/sort_kernel.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { template -__global__ void CopyValuesKernel(const T* x_values, - const IntT* indexs, - const int64_t n, - const int64_t stride, - T* out_values) { - CUDA_KERNEL_LOOP_TYPE(i, n * stride, int64_t) { - int64_t indices_i = i / stride; - int64_t slice_i = i - indices_i * stride; // offset inside the slice - IntT in_i = indexs[indices_i]; - out_values[i * stride + slice_i] = x_values[in_i * stride + slice_i]; - } -} - -template -void SortGPUKernel(const GPUContext& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { +void CoalescedGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { const DenseTensor& x_indices = x.non_zero_indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor out_indices = phi::EmptyLike(dev_ctx, x_indices); @@ -93,10 +80,13 @@ void SortGPUKernel(const GPUContext& dev_ctx, DenseTensor values_indexs = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT32, {nnz}, DataLayout::NCHW)); int* values_indexs_ptr = values_indexs.data(); + DenseTensor public_indexs = phi::EmptyLike(dev_ctx, values_indexs); // values_indexs = [0,1,2,,,nnz-1] phi::IndexKernel>( dev_ctx, &values_indexs, kps::IdentityFunctor()); + phi::IndexKernel>( + dev_ctx, &public_indexs, kps::IdentityFunctor()); // 3. sort (indices, values index) #ifdef PADDLE_WITH_HIP @@ -108,13 +98,52 @@ void SortGPUKernel(const GPUContext& dev_ctx, indexs_ptr + nnz, values_indexs_ptr); + // 4. unique index + thrust::pair new_end = +#ifdef PADDLE_WITH_HIP + thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), +#endif + indexs_ptr, + indexs_ptr + nnz, + public_indexs.data()); + + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + indexs_ptr, new_end.first, out_indices.data()); + + IntT out_nnz = 0; + phi::backends::gpu::GpuMemcpyAsync(&out_nnz, + out_indices.data(), + sizeof(IntT), +#ifdef PADDLE_WITH_HIP + hipMemcpyDeviceToHost, +#else + cudaMemcpyDeviceToHost, +#endif + dev_ctx.stream()); + dev_ctx.Wait(); + + out_indices.Resize({x_indices.dims()[0], out_nnz}); + if (out_values.dims().size() == 1) { + out_values.Resize(phi::make_ddim({out_nnz})); + } else { + out_values.Resize(phi::make_ddim({out_nnz, x_values.dims()[1]})); + } + + // 5. scatter the values config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz * stride, 1); - // 4. scatter the values - CopyValuesKernel<<>>( - x_values_ptr, values_indexs_ptr, nnz, stride, out_values.data()); + phi::funcs::sparse::ScatterKernel<<>>( + x_values_ptr, + public_indexs.data(), + values_indexs_ptr, + out_nnz, + nnz, + stride, + out_values.data()); // 6. 
convert index to coordinate Dim const_dims; @@ -122,23 +151,24 @@ void SortGPUKernel(const GPUContext& dev_ctx, const_dims[i] = x.dims()[i]; } - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, nnz, 1); + config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); phi::funcs::sparse::IndexToCoordinateKernel<<>>( - indexs_ptr, const_dims, nnz, sparse_dim, out_indices.data()); + indexs_ptr, const_dims, out_nnz, sparse_dim, out_indices.data()); out->SetMember(out_indices, out_values, x.dims(), true); } template -void SortKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { - PD_VISIT_INTEGRAL_TYPES(x.non_zero_indices().dtype(), "SortGPUKernel", ([&] { - SortGPUKernel(dev_ctx, x, out); - })); +void CoalescedKernel(const Context& dev_ctx, + const SparseCooTensor& x, + SparseCooTensor* out) { + PD_VISIT_INTEGRAL_TYPES( + x.non_zero_indices().dtype(), "CoalescedGPUKernel", ([&] { + CoalescedGPUKernel(dev_ctx, x, out); + })); } } // namespace sparse @@ -147,7 +177,7 @@ void SortKernel(const Context& dev_ctx, PD_REGISTER_KERNEL(sort, GPU, ALL_LAYOUT, - phi::sparse::SortKernel, + phi::sparse::CoalescedKernel, float, double, phi::dtype::float16, diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 1bceb767b6708..c5f6f2746f681 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/kernels/funcs/index_impl.cu.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/utils.cu.h" #include "paddle/phi/kernels/primitive/compute_primitives.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" @@ -58,46 +59,6 @@ __global__ void GatherKernel(const T* params, } } -/** - * brief: scatter add - * input: the inputs - * unique_value: refer to UpdateIndexKernel notes - * out_index: the output feature index - * non_zero_num: the number of output features - * rulebook_len: the length of rulebook - * channels: the output channel size - * out: the outputs -**/ -template -__global__ void ScatterKernel(const T* input, - const int* unique_value, - const int* out_index, - const int non_zero_num, - const int rulebook_len, - const int channels, - T* out, - const bool subm = false) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < non_zero_num * channels; i += gridDim.x * blockDim.x) { - int indices_i = i / channels; - int channels_i = i - indices_i * channels; - - int start = unique_value[indices_i]; - int end = indices_i == non_zero_num - 1 ? 
rulebook_len - : unique_value[indices_i + 1]; - // max(end-start) = kernel_size - T sum = static_cast(0); - if (subm) { - sum = out[indices_i * channels + channels_i]; - } - for (int j = start; j < end; j++) { - const int out_feature_i = out_index[j]; - sum += input[out_feature_i * channels + channels_i]; - } - out[indices_i * channels + channels_i] = sum; - } -} - template inline IntT* SortedAndUniqueIndex(const Context& dev_ctx, const IntT* rulebook_ptr, @@ -213,14 +174,6 @@ __global__ void UpdateIndexKernel(const T* unique_keys, } } -// brief: calculation the distance between start and end -template -__global__ void DistanceKernel(const T* start, const T* end, T* distance) { - if (threadIdx.x == 0) { - *distance = end - start; - } -} - /** * @brief product rulebook * for input_i in x_indices: @@ -395,7 +348,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + rulebook_rows * rulebook_cols, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); IntT rulebook_len = 0; phi::backends::gpu::GpuMemcpyAsync( @@ -480,7 +433,7 @@ int ProductRuleBook(const Context& dev_ctx, val_result_start); } - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( key_result.data(), end.first, key_result.data() + rulebook_len); @@ -516,7 +469,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr, rulebook_ptr + 3 * rulebook_len, -1); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( rulebook_ptr, last, key_result.data() + rulebook_len); phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, key_result.data() + rulebook_len, @@ -587,7 +540,7 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::distance doesn't support stream parameters // const int out_non_zero_num = thrust::distance(unique_key_ptr, // new_end.first); - DistanceKernel<<<1, 1>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1>>>( unique_key_ptr, new_end, rulebook_ptr + rulebook_rows * rulebook_cols - 1); diff --git a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu index 6c37f759923c3..5e7de7518e298 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/copy_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -222,17 +223,18 @@ void Conv3dGradGPUKernel(const GPUContext& dev_ctx, config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, rulebook_len * in_channels, 1); - ScatterKernel<<>>(d_x_features_ptr, - unique_value.data(), - out_index.data(), - x.nnz(), - rulebook_len, - in_channels, - x_grad_values_ptr, - subm); + phi::funcs::sparse::ScatterKernel<<>>( + d_x_features_ptr, + unique_value.data(), + out_index.data(), + x.nnz(), + rulebook_len, + in_channels, + x_grad_values_ptr, + subm); } template diff --git a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu index 83f19ce5785df..abf84dc7d739b 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/convolution_kernel.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/visit_type.h" #include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/sparse/scatter.cu.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" @@ -149,16 +150,17 @@ void Conv3dGPUKernel(const GPUContext& dev_ctx, // 4. scatter config = phi::backends::gpu::GetGpuLaunchConfig1D( dev_ctx, out->nnz() * out_channels, 1); - ScatterKernel<<>>(out_features_ptr, - unique_value.data(), - out_index.data(), - out->nnz(), - n, - out_channels, - out_values_ptr); + phi::funcs::sparse::ScatterKernel<<>>( + out_features_ptr, + unique_value.data(), + out_index.data(), + out->nnz(), + n, + out_channels, + out_values_ptr); } /** * x: (N, D, H, W, C) diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 83de8bfec5d1b..072e6f141f8f1 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/sort_kernel.h" +#include "paddle/phi/kernels/sparse/coalesced_kernel.h" namespace phi { namespace sparse { @@ -155,11 +155,9 @@ void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& indices, const IntArray& dense_shape, SparseCooTensor* out) { - SparseCooTensor before_sorted( + SparseCooTensor before_coalesced( indices, values, phi::make_ddim(dense_shape.GetData())); - // sort - SortKernel(dev_ctx, before_sorted, out); - // TODO(zhangkaihuo): merge the dumplicate indices + CoalescedKernel(dev_ctx, before_coalesced, out); } } // namespace sparse diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 157da73ad07de..a13cb58520d9d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -230,13 +230,14 @@ def test_sparse_coo_tensor_sorted(self): with _test_eager_guard(): for device in devices: paddle.device.set_device(device) - indices = [[1, 0], [0, 1]] - values = [1.0, 2.0] + #test unsorted and duplicate indices + indices = [[1, 0, 0], [0, 1, 1]] + values = [1.0, 2.0, 3.0] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) indices_sorted = [[0, 1], [1, 0]] - values_sorted = [2.0, 1.0] + values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) assert np.array_equal(values_sorted, sparse_x.values().numpy()) From 1385e1cb81c9c83a4546909bb9103cfbb8b607ef Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 15 Apr 2022 08:21:37 +0000 Subject: [PATCH 11/23] fix conflict --- paddle/phi/kernels/sparse/gpu/convolution.cu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/sparse/gpu/convolution.cu.h b/paddle/phi/kernels/sparse/gpu/convolution.cu.h index 0acfa41c4ef9b..fcbb3c60183eb 100644 --- a/paddle/phi/kernels/sparse/gpu/convolution.cu.h +++ b/paddle/phi/kernels/sparse/gpu/convolution.cu.h @@ -422,7 +422,7 @@ int ProductRuleBook(const Context& dev_ctx, rulebook_ptr + 3 * rulebook_len, -1); phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( - rulebook_ptr, last, key_result.data() + rulebook_len); + rulebook_ptr, last, bound_ptr); phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, bound_ptr, sizeof(IntT), @@ -489,7 +489,7 @@ int ProductRuleBook(const Context& dev_ctx, // thrust::distance doesn't support stream parameters // const int out_non_zero_num = thrust::distance(unique_key_ptr, // new_end.first); - DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( + phi::funcs::sparse::DistanceKernel<<<1, 1, 0, dev_ctx.stream()>>>( unique_key_ptr, new_end, rulebook_ptr + rulebook_rows * rulebook_cols - 1); From ed97c98577e16f3cea867643e06565fa4e343a96 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Fri, 15 Apr 2022 09:23:00 +0000 Subject: [PATCH 12/23] Add parameter verification --- .../tests/unittests/test_sparse_utils_op.py | 119 ++++++++++++++++-- python/paddle/sparse/creation.py | 60 ++++++++- 2 files changed, 167 insertions(+), 12 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index a13cb58520d9d..d56556720e9be 100644 --- 
a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -32,6 +32,8 @@ def test_create_coo_by_tensor(self): dense_elements = paddle.to_tensor(values, dtype='float32') coo = paddle.sparse.sparse_coo_tensor( dense_indices, dense_elements, dense_shape, stop_gradient=False) + # test the to_string.py + print(coo) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -69,6 +71,8 @@ def test_create_csr_by_np(self): dense_shape = [3, 4] csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # test the to_string.py + print(csr) assert np.array_equal(crows, csr.crows().numpy()) assert np.array_equal(cols, csr.cols().numpy()) assert np.array_equal(values, csr.values().numpy()) @@ -243,15 +247,112 @@ def test_sparse_coo_tensor_sorted(self): assert np.array_equal(values_sorted, sparse_x.values().numpy()) -class TestError(unittest.TestCase): - def test_sparse_coo_tensor(self): - with self.assertRaises(ValueError): - indices = [[2, 3], [0, 2]] - values = [1, 2] - #the shape too small - dense_shape = [2, 2] - sparse_x = paddle.sparse.sparse_coo_tensor( - indices, values, shape=dense_shape) +class TestCooError(unittest.TestCase): + def test_small_shape(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[2, 3], [0, 2]] + values = [1, 2] + # 1. the shape too small + dense_shape = [2, 2] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=dense_shape) + + def test_same_nnz(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + # 2. test the nnz of indices must same as nnz of values + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + def test_same_dimensions(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + indices = [[1, 2], [1, 0]] + values = [1, 2, 3] + shape = [2, 3, 4] + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=shape) + + def test_indices_dtype(self): + with _test_eager_guard(): + with self.assertRaises(TypeError): + indices = [[1.0, 2.0], [0, 1]] + values = [1, 2] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + + +class TestCsrError(unittest.TestCase): + def test_dimension1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_dimension2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2] + values = [1, 2, 3] + shape = [3, 3, 3, 3] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape1(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape2(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3] + cols = [0, 1, 2, 3] + values = [1, 2, 3, 4] + shape = [3, 4] + sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values, + shape) + + def test_same_shape3(self): + with _test_eager_guard(): + with self.assertRaises(ValueError): + crows = [0, 1, 2, 3, 0, 1, 2] + cols = [0, 1, 2, 3, 0, 1, 2] + values = [1, 2, 3, 4, 0, 1, 2] + shape = 
[2, 3, 4]
+                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
+                                                            shape)
+
+    def test_crows_first_value(self):
+        with _test_eager_guard():
+            with self.assertRaises(ValueError):
+                crows = [1, 1, 2, 3]
+                cols = [0, 1, 2]
+                values = [1, 2, 3]
+                shape = [3, 4]
+                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
+                                                            shape)
+
+    def test_dtype(self):
+        with _test_eager_guard():
+            with self.assertRaises(TypeError):
+                crows = [0, 1, 2, 3.0]
+                cols = [0, 1, 2]
+                values = [1, 2, 3]
+                shape = [3]
+                sparse_x = paddle.sparse.sparse_csr_tensor(crows, cols, values,
+                                                            shape)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py
index b48cc60851c99..d494336e1ff50 100644
--- a/python/paddle/sparse/creation.py
+++ b/python/paddle/sparse/creation.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import paddle
 from paddle import _C_ops
 from ..framework import core, dygraph_only
 from ..framework import _current_expected_place, _get_paddle_place
@@ -51,6 +52,13 @@ def _get_place(place):
     return place
 
 
+def _check_indices_dtype(dtype):
+    if dtype not in [paddle.int8, paddle.int16, paddle.int32, paddle.int64]:
+        raise TypeError(
+            "the dtype of indices must be 'int8' or 'int16' or 'int32' or 'int64'"
+        )
+
+
 @dygraph_only
 def sparse_coo_tensor(indices,
                       values,
@@ -117,6 +125,18 @@ def sparse_coo_tensor(indices,
     if len(indices.shape) != 2:
         raise ValueError("'indices' must be 2-D.")
 
+    nnz = indices.shape[1]
+    sparse_dim = indices.shape[0]
+
+    _check_indices_dtype(indices.dtype)
+
+    if nnz != values.shape[0]:
+        raise ValueError(
+            "the indices and values must have the same number of non-zero elements, but got {} and {}".
+            format(nnz, values.shape[0]))
+
+    dense_dim = len(values.shape) - 1
+
     if not indices.place._equals(place):
         indices = indices._copy_to(place, False)
 
@@ -132,6 +152,10 @@ def sparse_coo_tensor(indices,
         if shape < min_shape:
             raise ValueError("the minimun shape required is {}, but get {}".
                              format(min_shape, shape))
+        if len(shape) != sparse_dim + dense_dim:
+            raise ValueError(
+                "the number of dimensions (len(shape)) must be sparse_dim({}) + dense_dim({}), but got {}".
+                format(sparse_dim, dense_dim, len(shape)))
 
     return _C_ops.final_state_sparse_create_sparse_coo_tensor(values, indices,
                                                                shape)
@@ -149,6 +173,7 @@ def sparse_csr_tensor(crows,
     r"""
     Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the
     ``crows``, ``cols`` and ``values``.
+    Currently, the crows and cols of each batch must be in ascending order.
 
     Args:
         crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the
@@ -207,10 +232,14 @@ def sparse_csr_tensor(crows,
         cols = to_tensor(cols, dtype=None, place=place, stop_gradient=True)
     if not isinstance(values, core.eager.Tensor):
         values = to_tensor(values, dtype, place, stop_gradient)
-    if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1:
+
+    _check_indices_dtype(crows.dtype)
+    _check_indices_dtype(cols.dtype)
+
+    if len(shape) != 2 and len(shape) != 3:
         raise ValueError(
-            "SparseCsrTensor only support 2-D or 3-D matrix. The 'crows', 'cols' and 'values' must be 1-D."
-        )
+            "SparseCsrTensor only supports 2-D or 3-D matrices, but got shape {}". 
+            format(shape))
 
     if not crows.place._equals(place):
         crows = crows._copy_to(place, False)
@@ -222,5 +251,30 @@ def sparse_csr_tensor(crows,
         values = values._copy_to(place, False)
         values = _handle_dtype(values, dtype)
     values.stop_gradient = stop_gradient
+
+    if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1:
+        raise ValueError("The 'crows', 'cols' and 'values' must be 1-D.")
+
+    if len(cols) != len(values):
+        raise ValueError(
+            "the length of cols must be the same as the length of values")
+
+    if len(shape) == 2:
+        if crows.shape[0] != shape[0] + 1:
+            raise ValueError(
+                "The length({}) of crows must be equal to the rows({})+1 of the matrix.".
+                format(crows.shape[0], shape[0]))
+        if crows[0] != 0:
+            raise ValueError("the 0th value of crows must be 0")
+
+        if crows[-1] != values.shape[0]:
+            raise ValueError(
+                "the last value of crows must be equal to the number of non-zero elements")
+    else:
+        if crows.shape[0] % (shape[0] + 1) != 0:
+            raise ValueError(
+                "The length({}) of crows must be divisible by the rows({})+1 of the matrix.".
+                format(crows.shape[0], shape[0]))
+    # TODO(zkh2016): check whether the values in crows and cols are legal
+
     return core.eager.sparse_csr_tensor(crows, cols, values, shape,
                                         stop_gradient)

From 591e3c1470393a9c9391d9fb33211be9f0a618ce Mon Sep 17 00:00:00 2001
From: zkh2016 
Date: Fri, 15 Apr 2022 12:58:04 +0000
Subject: [PATCH 13/23] fix device

---
 .../tests/unittests/test_sparse_utils_op.py   | 64 ++++++++++---------
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
index d56556720e9be..c87626a10c631 100644
--- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py
@@ -212,39 +212,45 @@ def test_coo_values_grad(self):
     def test_sparse_coo_tensor_grad(self):
         with _test_eager_guard():
             for device in devices:
-                paddle.device.set_device(device)
-                indices = [[0, 1], [0, 1]]
-                values = [1, 2]
-                indices = paddle.to_tensor(indices, dtype='int32')
-                values = paddle.to_tensor(
-                    values, dtype='float32', stop_gradient=False)
-                sparse_x = paddle.sparse.sparse_coo_tensor(
-                    indices, values, shape=[2, 2], stop_gradient=False)
-                grad_indices = [[0, 1], [1, 1]]
-                grad_values = [2, 3]
-                grad_indices = paddle.to_tensor(grad_indices, dtype='int32')
-                grad_values = paddle.to_tensor(grad_values, dtype='float32')
-                sparse_out_grad = paddle.sparse.sparse_coo_tensor(
-                    grad_indices, grad_values, shape=[2, 2])
-                sparse_x.backward(sparse_out_grad)
-                correct_values_grad = [0, 3]
-                assert np.array_equal(correct_values_grad, values.grad.numpy())
+                if device == 'cpu' or (device == 'gpu' and
+                                       paddle.is_compiled_with_cuda()):
+                    paddle.device.set_device(device)
+                    indices = [[0, 1], [0, 1]]
+                    values = [1, 2]
+                    indices = paddle.to_tensor(indices, dtype='int32')
+                    values = paddle.to_tensor(
+                        values, dtype='float32', stop_gradient=False)
+                    sparse_x = paddle.sparse.sparse_coo_tensor(
+                        indices, values, shape=[2, 2], stop_gradient=False)
+                    grad_indices = [[0, 1], [1, 1]]
+                    grad_values = [2, 3]
+                    grad_indices = paddle.to_tensor(grad_indices, dtype='int32')
+                    grad_values = paddle.to_tensor(grad_values, dtype='float32')
+                    sparse_out_grad = paddle.sparse.sparse_coo_tensor(
+                        grad_indices, grad_values, shape=[2, 2])
+                    sparse_x.backward(sparse_out_grad)
+                    correct_values_grad = [0, 3]
+                    assert np.array_equal(correct_values_grad,
+                                           values.grad.numpy())
 
     def test_sparse_coo_tensor_sorted(self):
         with 
_test_eager_guard(): for device in devices: - paddle.device.set_device(device) - #test unsorted and duplicate indices - indices = [[1, 0, 0], [0, 1, 1]] - values = [1.0, 2.0, 3.0] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) - indices_sorted = [[0, 1], [1, 0]] - values_sorted = [5.0, 1.0] - assert np.array_equal(indices_sorted, - sparse_x.indices().numpy()) - assert np.array_equal(values_sorted, sparse_x.values().numpy()) + if device == 'cpu' or (device == 'gpu' and + paddle.is_compiled_with_cuda()): + paddle.device.set_device(device) + #test unsorted and duplicate indices + indices = [[1, 0, 0], [0, 1, 1]] + values = [1.0, 2.0, 3.0] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + indices_sorted = [[0, 1], [1, 0]] + values_sorted = [5.0, 1.0] + assert np.array_equal(indices_sorted, + sparse_x.indices().numpy()) + assert np.array_equal(values_sorted, + sparse_x.values().numpy()) class TestCooError(unittest.TestCase): From 8ecfa706c5f09bc553f6f1921915d2d1e4e0dca3 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 18 Apr 2022 08:53:10 +0000 Subject: [PATCH 14/23] add sparse norm and pool --- .../fluid/tests/unittests/test_sparse_norm.py | 15 +++- .../tests/unittests/test_sparse_pool_op.py | 36 +++++++++ python/paddle/sparse/__init__.py | 4 +- python/paddle/sparse/layer/__init__.py | 2 +- python/paddle/sparse/layer/norm.py | 75 +++++++++++-------- python/paddle/utils/code_gen/sparse_api.yaml | 9 +++ .../paddle/utils/code_gen/sparse_bw_api.yaml | 7 ++ 7 files changed, 108 insertions(+), 40 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm.py b/python/paddle/fluid/tests/unittests/test_sparse_norm.py index ac505e3b7b42e..e1081b03270af 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm.py @@ -22,11 +22,18 @@ class TestSparseBatchNorm(unittest.TestCase): def test_sparse_batch_norm(self): with _test_eager_guard(): - shape = [2, 6, 6, 6, 4] + channels = 1 + shape = [1, 1, 6, 6, channels] dense_x = paddle.randn(shape) - print(dense_x) - batch_norm = paddle.nn.BatchNorm3D(4, data_format="NDHWC") + batch_norm = paddle.nn.BatchNorm3D(channels, data_format="NDHWC") dense_y = batch_norm(dense_x) + sparse_dim = 4 sparse_x = dense_x.to_sparse_coo(sparse_dim) - batch_norm = paddle.sparse.BatchNorm(4) + sparse_batch_norm = paddle.sparse.BatchNorm1D(channels) + sparse_y = sparse_batch_norm(sparse_x) + assert np.allclose( + dense_y.flatten().numpy(), + sparse_y.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py new file mode 100644 index 0000000000000..c8c8d0637eb05 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py @@ -0,0 +1,36 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from paddle import _C_ops +from paddle.fluid.framework import _test_eager_guard + + +class TestSparseMaxPool3D(unittest.TestCase): + def test1(self): + with _test_eager_guard(): + dense_x = paddle.randn((1, 1, 4, 4, 1)) + print(dense_x) + sparse_x = dense_x.to_sparse_coo(4) + kernel_sizes = [1, 3, 3] + paddings = [0, 0, 0] + strides = [1, 1, 1] + dilations = [1, 1, 1] + out = _C_ops.final_state_sparse_max_pool3d( + sparse_x, kernel_sizes, paddings, dilations, strides) + print(out) diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 23ee0c5014aed..680d2d5109857 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -15,12 +15,12 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer.norm import BatchNorm1D from .layer.conv import Conv3D from .layer.conv import SubmConv3D __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm' + 'BatchNorm1D' ] diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index ee32e5027b50f..f065e6c8bb3c1 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. from .activation import ReLU -from .norm import BatchNorm +from .norm import BatchNorm1D from .conv import Conv3D from .conv import SubmConv3D diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py index 4f2818cd2dcb2..71d1b55eeb26a 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/sparse/layer/norm.py @@ -25,46 +25,55 @@ # See the License for the specific language governing permissions and # limitations under the License. 
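+# The layer added below normalizes a SparseCooTensor by running a dense
+# batch_norm over input.values(), which is laid out as [nnz, channels], and
+# then rebuilding the sparse tensor with the original indices, so the batch
+# statistics are computed only over the stored non-zero entries.
+#
+# A minimal usage sketch (illustrative only; it mirrors the unit test above
+# and assumes `dense_x` is a 5-D dense tensor in NDHWC layout):
+#
+#     sparse_x = dense_x.to_sparse_coo(4)
+#     bn = paddle.sparse.BatchNorm1D(num_features=dense_x.shape[-1])
+#     sparse_y = bn(sparse_x)  # SparseCooTensor with normalized values
+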
+import paddle +import warnings -class BatchNorm(paddle.fluid.dygraph.BatchNorm): + +class BatchNorm1D(paddle.nn.BatchNorm1D): def __init__(self, - num_channels, - act=None, - is_test=False, + num_features, momentum=0.9, epsilon=1e-05, - param_attr=None, + weight_attr=None, bias_attr=None, - dtype='float32', - data_layout='NCHW', - in_place=False, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - trainable_statistics=False): - super(BatchNorm, self).__init__( - num_channels, - act=act, - is_test=is_test, + data_format='NCL', + use_global_stats=None, + name=None): + super(BatchNorm1D, self).__init__( + num_features, momentum=momentum, epsilon=epsilon, - param_attr=param_attr, + weight_attr=weight_attr, bias_attr=bias_attr, - dtype=dtype, - data_layout, - in_place=in_place, - moving_mean_name=moving_mean_name, - moving_variance_name=moving_variance_name, - do_model_average_for_mean_and_var=do_model_average_for_mean_and_var, + data_format=data_format, use_global_stats=use_global_stats, - trainable_statistics=tranable_statistics) + name=name) + + def forward(self, input): + values = input.values() + #out = super(BatchNorm1D, self).forward(values) + self._check_data_format(self._data_format) + + self._check_input_dim(values) + + if self.training: + warnings.warn( + "When training, we now always track global mean and variance.") + + out = paddle.nn.functional.batch_norm( + values, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format, + use_global_stats=self._use_global_stats) - def forward(self, input): - values = input.values() - out = super(BatchNorm, self).forward(values) - return paddle.sparse.sparse_coo_tensor( - input.indices(), - out, - shape=input.shape, - stop_gradient=input.stop_gradient) + return paddle.sparse.sparse_coo_tensor( + input.indices(), + out, + shape=input.shape, + stop_gradient=input.stop_gradient) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index 100d7ad78319b..f1a2f427b2af0 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -65,3 +65,12 @@ args : (Tensor x) output : Tensor(out@SparseCsrTensor) invoke : to_sparse_csr_impl(x) + +- api: max_pool3d + args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) + output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + kernel : + func : sparse_maxpool + layout : x + intermediate : rulebook + backward : max_pool3d_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index e3946cbf72bc2..e0e1675db398c 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -32,6 +32,13 @@ output : Tensor(x_grad@DenseTensor) invoke : to_dense_impl(out_grad) +- backward_api : max_pool3d_grad + forward : max_pool3d (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_maxpool_grad + - backward_api : sparse_relu_grad forward : sparse_relu(Tensor x) -> Tensor(out@SparseCooTensor) args : (Tensor x, Tensor out_grad) From 
6153a5a5d82ecb4576926c460772900f128a8009 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Mon, 18 Apr 2022 09:12:41 +0000 Subject: [PATCH 15/23] batch norm --- .../fluid/tests/unittests/test_sparse_norm.py | 17 +++- python/paddle/sparse/layer/norm.py | 78 ++++++++++++++++++- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm.py b/python/paddle/fluid/tests/unittests/test_sparse_norm.py index e1081b03270af..30fb6a3bccbe0 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm.py @@ -30,10 +30,25 @@ def test_sparse_batch_norm(self): sparse_dim = 4 sparse_x = dense_x.to_sparse_coo(sparse_dim) - sparse_batch_norm = paddle.sparse.BatchNorm1D(channels) + sparse_batch_norm = paddle.sparse.BatchNorm(channels) sparse_y = sparse_batch_norm(sparse_x) assert np.allclose( dense_y.flatten().numpy(), sparse_y.values().flatten().numpy(), atol=1e-5, rtol=1e-5) + + def test(self): + with _test_eager_guard(): + np.random.seed(123) + channels = 3 + x_data = np.random.random(size=(1, 6, 6, 6, + channels)).astype('float32') + dense_x = paddle.to_tensor(x_data) + sparse_x = dense_x.to_sparse_coo(4) + batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm_out = batch_norm(sparse_x) + print(batch_norm_out) + + +#TODO(zkh2016): add more test diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py index 71d1b55eeb26a..63b0d3dca23e1 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/sparse/layer/norm.py @@ -29,7 +29,83 @@ import warnings -class BatchNorm1D(paddle.nn.BatchNorm1D): +class BatchNorm(paddle.nn.BatchNorm1D): + r""" + Applies Batch Normalization over a SparseCooTensor as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . + + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. + Calculated as follows: + + .. math:: + + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ + + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. + They are global or running statistics (moving_mean and moving_variance). It usually got from the + pre-trained model. Calculated as follows: + + .. math:: + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ + + The normalization function formula is as follows: + + .. math:: + + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter + + Parameters: + num_features(int): Indicate the number of channels of the input ``Tensor``. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. + momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
+ weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` + of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as weight_attr. If it is set to False, the weight is not learnable. + If the Initializer of the weight_attr is not set, the parameter is initialized with Xavier. Default: None. + bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of batch_norm. + If it is set to None or one attribute of ParamAttr, batch_norm + will create ParamAttr as bias_attr. If it is set to False, the bias is not learnable. + If the Initializer of the bias_attr is not set, the bias is initialized to zero. Default: None. + data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default: "NCL". + use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`. + + Shape: + - x: A SparseCooTensor with layout = 'NDHWC'. + - output: SparseCooTensor with same shape as input x. + + Returns: + None. + + + Examples: + .. code-block:: python + + import paddle + import numpy as np + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + np.random.seed(123) + channels = 3 + x_data = np.random.random(size=(1, 6, 6, 6, channels)).astype('float32') + dense_x = paddle.to_tensor(x_data) + sparse_x = dense_x.to_sparse_coo(4) + batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm_out = batch_norm(sparse_x) + print(batch_norm_out) + """ def __init__(self, num_features, momentum=0.9, From 4667790e44fc104b266d194654c54a1ea331375f Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 02:56:51 +0000 Subject: [PATCH 16/23] fix getting stride, fix infer dense shape --- paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc | 2 +- paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc | 2 +- paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu | 2 +- paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu | 2 +- python/paddle/sparse/creation.py | 12 +++++++++--- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc index 0ebddf9b683f0..22c5e14b35f56 100644 --- a/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/coalesced_kernel.cc @@ -44,7 +44,7 @@ void CoalescedCPUKernel(const CPUContext& dev_ctx, const T* x_values_ptr = x_values.data(); const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; std::map> indices_to_index; for (uint64_t i = 0; i < x_indexs.size(); i++) { diff --git a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc index 1508de407caa7..0ec8b808ba838 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_mask_kernel.cc @@ -125,7 +125,7 @@ void SparseMaskHelperCPUKernel(const CPUContext& dev_ctx, T* out_ptr = out->data(); memset(out_ptr, static_cast(0), out->numel() * sizeof(T)); const int64_t stride = - x.dims().size() == sparse_dim ?
1 : x.dims().size() - sparse_dim; + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; const T* in_ptr = x.non_zero_elements().data(); // TODO(zhangkaihuo): multithreading can be used for acceleration for (uint64_t i = 0; i < mask_indexs.size(); i++) { diff --git a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu index 3ffcd28955a53..b2e7884580c74 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesced_kernel.cu @@ -76,7 +76,7 @@ void CoalescedGPUKernel(const GPUContext& dev_ctx, // 2. get the address of each non-zero values const T* x_values_ptr = x_values.data(); const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; DenseTensor values_indexs = phi::Empty( dev_ctx, DenseTensorMeta(DataType::INT32, {nnz}, DataLayout::NCHW)); int* values_indexs_ptr = values_indexs.data(); diff --git a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu index 4e2d12f33955e..4253845956ea7 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_mask_kernel.cu @@ -231,7 +231,7 @@ void SparseMaskHelperGPUKernel(const GPUContext& dev_ctx, T* out_ptr = out->data(); const int64_t stride = - x.dims().size() == sparse_dim ? 1 : x.dims().size() - sparse_dim; + x.dims().size() == sparse_dim ? 1 : x.non_zero_elements().dims()[1]; SparseMaskCopyKernel<< 1: + lens = np.append(lens, values.shape[1:]) + return list(lens) def _get_place(place): @@ -145,7 +150,8 @@ def sparse_coo_tensor(indices, values = _handle_dtype(values, dtype) values.stop_gradient = stop_gradient - min_shape = _infer_dense_shape(indices) + min_shape = _infer_dense_shape(indices, values) + if shape is None: shape = min_shape else: From 378eb60fe68a1172d6b71e5290f3dc5a7c947657 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 03:18:53 +0000 Subject: [PATCH 17/23] sparse batch_norm --- .../fluid/tests/unittests/test_sparse_norm.py | 51 ++++++++++++------- .../tests/unittests/test_sparse_utils_op.py | 41 ++++++++++++++- python/paddle/sparse/__init__.py | 4 +- python/paddle/sparse/layer/__init__.py | 2 +- python/paddle/sparse/layer/norm.py | 19 ++++--- 5 files changed, 89 insertions(+), 28 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm.py b/python/paddle/fluid/tests/unittests/test_sparse_norm.py index 30fb6a3bccbe0..937f24c26f29d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm.py @@ -17,38 +17,55 @@ import numpy as np import paddle from paddle.fluid.framework import _test_eager_guard +import copy class TestSparseBatchNorm(unittest.TestCase): - def test_sparse_batch_norm(self): + def test(self): with _test_eager_guard(): - channels = 1 - shape = [1, 1, 6, 6, channels] + paddle.seed(0) + channels = 4 + shape = [2, 3, 6, 6, channels] + #there is no zero in dense_x dense_x = paddle.randn(shape) + dense_x.stop_gradient = False + batch_norm = paddle.nn.BatchNorm3D(channels, data_format="NDHWC") dense_y = batch_norm(dense_x) + dense_y.backward(dense_y) sparse_dim = 4 - sparse_x = dense_x.to_sparse_coo(sparse_dim) + dense_x2 = copy.deepcopy(dense_x) + dense_x2.stop_gradient = False + sparse_x = dense_x2.to_sparse_coo(sparse_dim) sparse_batch_norm = paddle.sparse.BatchNorm(channels) + # set same params + 
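A small worked example (not part of the patch) of what the creation.py shape-inference change above enables: when each non-zero entry carries a trailing dense vector, the inferred minimum shape is the per-dimension maximum index plus one, extended by the value vector's shape.

import numpy as np

indices = np.array([[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]])   # 2 sparse dims, 5 entries
values = np.random.rand(5, 2)                            # each entry is a length-2 vector

sparse_part = [int(m) + 1 for m in indices.max(axis=1)]  # [3, 4] from the largest index per dim
dense_part = list(values.shape[1:])                      # [2], contributed by the value vectors
print(sparse_part + dense_part)                          # [3, 4, 2], the minimum dense shape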
sparse_batch_norm._mean.set_value(batch_norm._mean) + sparse_batch_norm._variance.set_value(batch_norm._variance) + sparse_batch_norm.weight.set_value(batch_norm.weight) + sparse_y = sparse_batch_norm(sparse_x) + # compare the result with dense batch_norm assert np.allclose( dense_y.flatten().numpy(), sparse_y.values().flatten().numpy(), atol=1e-5, rtol=1e-5) - def test(self): + # test backward + sparse_y.backward(sparse_y) + assert np.allclose( + dense_x.grad.flatten().numpy(), + sparse_x.grad.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5) + + def test_error_layout(self): with _test_eager_guard(): - np.random.seed(123) - channels = 3 - x_data = np.random.random(size=(1, 6, 6, 6, - channels)).astype('float32') - dense_x = paddle.to_tensor(x_data) - sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.sparse.BatchNorm(channels) - batch_norm_out = batch_norm(sparse_x) - print(batch_norm_out) - - -#TODO(zkh2016): add more test + with self.assertRaises(ValueError): + shape = [2, 3, 6, 6, 3] + x = paddle.randn(shape) + sparse_x = x.to_sparse_coo(4) + sparse_batch_norm = paddle.sparse.BatchNorm( + 3, data_format='NCDHW') + sparse_batch_norm(sparse_x) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index c87626a10c631..ad282e2fd2418 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -19,7 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard -devices = ['cpu', 'gpu'] +devices = ['cpu'] class TestSparseCreate(unittest.TestCase): @@ -208,6 +208,20 @@ def test_coo_values_grad(self): # test coo_values_grad values_tensor.backward(paddle.to_tensor(out_grad)) assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) + indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] + values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], + [5.0, 5.0]] + sparse_x = paddle.sparse.sparse_coo_tensor( + paddle.to_tensor(indices), + paddle.to_tensor(values), + shape=[3, 4, 2], + stop_gradient=False) + values_tensor = sparse_x.values() + out_grad = [[2.0, 2.0], [3.0, 3.0], [5.0, 5.0], [8.0, 8.0], + [9.0, 9.0]] + # test coo_values_grad + values_tensor.backward(paddle.to_tensor(out_grad)) + assert np.array_equal(out_grad, sparse_x.grad.values().numpy()) def test_sparse_coo_tensor_grad(self): with _test_eager_guard(): @@ -233,6 +247,21 @@ def test_sparse_coo_tensor_grad(self): assert np.array_equal(correct_values_grad, values.grad.numpy()) + # test the non-zero values is a vector + values = [[1, 1], [2, 2]] + values = paddle.to_tensor( + values, dtype='float32', stop_gradient=False) + sparse_x = paddle.sparse.sparse_coo_tensor( + indices, values, shape=[2, 2, 2], stop_gradient=False) + grad_values = [[2, 2], [3, 3]] + grad_values = paddle.to_tensor(grad_values, dtype='float32') + sparse_out_grad = paddle.sparse.sparse_coo_tensor( + grad_indices, grad_values, shape=[2, 2, 2]) + sparse_x.backward(sparse_out_grad) + correct_values_grad = [[0, 0], [3, 3]] + assert np.array_equal(correct_values_grad, + values.grad.numpy()) + def test_sparse_coo_tensor_sorted(self): with _test_eager_guard(): for device in devices: @@ -252,6 +281,16 @@ def test_sparse_coo_tensor_sorted(self): assert np.array_equal(values_sorted, sparse_x.values().numpy()) + # test the non-zero values is a vector + values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]] + values = paddle.to_tensor(values, dtype='float32') + sparse_x = 
paddle.sparse.sparse_coo_tensor(indices, values) + values_sorted = [[5.0, 5.0], [1.0, 1.0]] + assert np.array_equal(indices_sorted, + sparse_x.indices().numpy()) + assert np.array_equal(values_sorted, + sparse_x.values().numpy()) + class TestCooError(unittest.TestCase): def test_small_shape(self): diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 680d2d5109857..23ee0c5014aed 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -15,12 +15,12 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor from .layer.activation import ReLU -from .layer.norm import BatchNorm1D +from .layer.norm import BatchNorm from .layer.conv import Conv3D from .layer.conv import SubmConv3D __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm1D' + 'BatchNorm' ] diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index f065e6c8bb3c1..ee32e5027b50f 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -13,7 +13,7 @@ # limitations under the License. from .activation import ReLU -from .norm import BatchNorm1D +from .norm import BatchNorm from .conv import Conv3D from .conv import SubmConv3D diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py index 63b0d3dca23e1..1cd8cd07f85af 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/sparse/layer/norm.py @@ -112,10 +112,10 @@ def __init__(self, epsilon=1e-05, weight_attr=None, bias_attr=None, - data_format='NCL', + data_format='NDHWC', use_global_stats=None, name=None): - super(BatchNorm1D, self).__init__( + super(BatchNorm, self).__init__( num_features, momentum=momentum, epsilon=epsilon, @@ -125,18 +125,23 @@ def __init__(self, use_global_stats=use_global_stats, name=name) + def _check_data_format(self, input): + if input != "NDHWC": + raise ValueError('sparse BatchNorm only support layout of "NDHWC"') + def forward(self, input): values = input.values() - #out = super(BatchNorm1D, self).forward(values) self._check_data_format(self._data_format) - self._check_input_dim(values) + if len(values.shape) != 2: + raise ValueError('expected 2D input.values() (got {}D)'.format( + len(values.shape))) if self.training: warnings.warn( "When training, we now always track global mean and variance.") - out = paddle.nn.functional.batch_norm( + batch_norm_out = paddle.nn.functional.batch_norm( values, self._mean, self._variance, @@ -145,11 +150,11 @@ def forward(self, input): training=self.training, momentum=self._momentum, epsilon=self._epsilon, - data_format=self._data_format, + data_format='NC', use_global_stats=self._use_global_stats) return paddle.sparse.sparse_coo_tensor( input.indices(), - out, + batch_norm_out, shape=input.shape, stop_gradient=input.stop_gradient) From 5467b9c814a6c1f412b6aba8c949da5190ea7ee6 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 06:10:51 +0000 Subject: [PATCH 18/23] remove pool3d --- python/paddle/utils/code_gen/sparse_api.yaml | 9 --------- python/paddle/utils/code_gen/sparse_bw_api.yaml | 7 ------- 2 files changed, 16 deletions(-) diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index f1a2f427b2af0..100d7ad78319b 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -65,12 +65,3 @@ args : (Tensor x) output : Tensor(out@SparseCsrTensor) invoke : 
to_sparse_csr_impl(x) - -- api: max_pool3d - args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) - output : Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - kernel : - func : sparse_maxpool - layout : x - intermediate : rulebook - backward : max_pool3d_grad diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index e0e1675db398c..e3946cbf72bc2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -32,13 +32,6 @@ output : Tensor(x_grad@DenseTensor) invoke : to_dense_impl(out_grad) -- backward_api : max_pool3d_grad - forward : max_pool3d (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) - output : Tensor(x_grad@SparseCooTensor) - kernel : - func : sparse_maxpool_grad - - backward_api : sparse_relu_grad forward : sparse_relu(Tensor x) -> Tensor(out@SparseCooTensor) args : (Tensor x, Tensor out_grad) From 3a4572542a0d98fb24e7c5e69eb7024b87c80a88 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 08:19:33 +0000 Subject: [PATCH 19/23] remove pool --- ..._sparse_norm.py => test_sparse_norm_op.py} | 0 .../tests/unittests/test_sparse_pool_op.py | 36 ------------------- 2 files changed, 36 deletions(-) rename python/paddle/fluid/tests/unittests/{test_sparse_norm.py => test_sparse_norm_op.py} (100%) delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_pool_op.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py similarity index 100% rename from python/paddle/fluid/tests/unittests/test_sparse_norm.py rename to python/paddle/fluid/tests/unittests/test_sparse_norm_op.py diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py deleted file mode 100644 index c8c8d0637eb05..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_pool_op.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
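Setting aside the pool-op removal above: the batch-norm layer built up earlier in this series boils down to running an ordinary dense normalization over the [nnz, channels] values matrix and re-wrapping the result with the original indices. A rough sketch of that equivalence (paddle.sparse.BatchNorm is the layer added by these patches; the rest is the stock dense API, and the comparison is a sanity check rather than anything taken from the diff):

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    channels = 3
    dense_x = paddle.randn((1, 4, 4, 4, channels))   # NDHWC input, random so nothing is zero
    sparse_x = dense_x.to_sparse_coo(4)              # 4 sparse dims, channels stay dense

    values = sparse_x.values()                       # shape [nnz, channels]
    ref = paddle.nn.BatchNorm1D(channels)(values)    # dense reference in 'NC' layout

    out = paddle.sparse.BatchNorm(channels)(sparse_x)
    # out keeps the indices of sparse_x; out.values() should match `ref` channel by channel.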
- -from __future__ import print_function -import unittest -import numpy as np -import paddle -import paddle.fluid.core as core -from paddle import _C_ops -from paddle.fluid.framework import _test_eager_guard - - -class TestSparseMaxPool3D(unittest.TestCase): - def test1(self): - with _test_eager_guard(): - dense_x = paddle.randn((1, 1, 4, 4, 1)) - print(dense_x) - sparse_x = dense_x.to_sparse_coo(4) - kernel_sizes = [1, 3, 3] - paddings = [0, 0, 0] - strides = [1, 1, 1] - dilations = [1, 1, 1] - out = _C_ops.final_state_sparse_max_pool3d( - sparse_x, kernel_sizes, paddings, dilations, strides) - print(out) From e20c1d9989335c3f6ed71b882fef1d38567e6200 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 11:59:03 +0000 Subject: [PATCH 20/23] fix examples --- python/paddle/sparse/layer/norm.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py index 1cd8cd07f85af..39b0028fe8ece 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/sparse/layer/norm.py @@ -96,14 +96,15 @@ class BatchNorm(paddle.nn.BatchNorm1D): from paddle.fluid.framework import _test_eager_guard with _test_eager_guard(): - np.random.seed(123) - channels = 3 - x_data = np.random.random(size=(1, 6, 6, 6, channels)).astype('float32') - dense_x = paddle.to_tensor(x_data) - sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.sparse.BatchNorm(channels) - batch_norm_out = batch_norm(sparse_x) - print(batch_norm_out) + np.random.seed(123) + channels = 3 + x_data = np.random.random(size=(1, 6, 6, 6, channels)).astype('float32') + dense_x = paddle.to_tensor(x_data) + sparse_x = dense_x.to_sparse_coo(4) + batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm_out = batch_norm(sparse_x) + print(batch_norm_out.shape) + # [1, 6, 6, 6, 3] """ def __init__(self, From b609c2b3cb409d8d0f8e953ea215ca5d0f8e06a6 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Wed, 20 Apr 2022 12:31:36 +0000 Subject: [PATCH 21/23] conv3d support bias --- .../tests/unittests/test_sparse_conv_op.py | 6 ++++-- python/paddle/sparse/functional/conv.py | 17 ++++++++++++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index 42f628c8fb1fd..1677051ee9db4 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -31,19 +31,21 @@ def test_conv3d(self): paddings = [0, 0, 0] strides = [1, 1, 1] dilations = [1, 1, 1] + bias = [1] indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] values = [1, 2, 3, 4] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - correct_out_values = [[4], [10]] + correct_out_values = [[5], [11]] sparse_input = core.eager.sparse_coo_tensor(indices, values, dense_shape, False) out = paddle.sparse.functional.conv3d( sparse_input, dense_kernel, - bias=None, + bias=paddle.to_tensor( + bias, dtype='float32'), stride=strides, padding=paddings, dilation=dilations, diff --git a/python/paddle/sparse/functional/conv.py b/python/paddle/sparse/functional/conv.py index d8c0e5c914ccb..42b7b49835cf0 100644 --- a/python/paddle/sparse/functional/conv.py +++ b/python/paddle/sparse/functional/conv.py @@ -16,6 +16,8 @@ from paddle import _C_ops, in_dynamic_mode from ...fluid.layers.utils import convert_to_list +from 
...fluid.layers.nn import elementwise_add +from .. import sparse_coo_tensor from paddle.nn.functional.conv import _update_padding_nd @@ -30,7 +32,6 @@ def _conv3d(x, data_format="NDHWC", name=None): assert in_dynamic_mode(), "Currently, only support dynamic mode" - assert bias == None, "Currently, sparse_conv3d does not support bias" assert groups == 1, "Currently, only support groups=1" dims = 3 @@ -61,8 +62,18 @@ def _conv3d(x, dilation = convert_to_list(dilation, dims, 'dilation') op_type = "conv3d" - return _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, - stride, groups, subm) + pre_bias = _C_ops.final_state_sparse_conv3d(x, weight, padding, dilation, + stride, groups, subm) + if bias is not None: + values = pre_bias.values() + add_bias = elementwise_add(values, bias, axis=1) + return sparse_coo_tensor( + pre_bias.indices(), + add_bias, + shape=pre_bias.shape, + stop_gradient=pre_bias.stop_gradient) + else: + return pre_bias def conv3d(x, From 422e8019d7b93a1b2cad1ce355f1e29a48d0a16b Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 21 Apr 2022 09:09:07 +0000 Subject: [PATCH 22/23] fix docs --- .../fluid/tests/unittests/test_sparse_norm_op.py | 16 ++++++++++++++++ python/paddle/sparse/layer/norm.py | 7 +++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index 937f24c26f29d..3c3085ec8be69 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -69,3 +69,19 @@ def test_error_layout(self): sparse_batch_norm = paddle.sparse.BatchNorm( 3, data_format='NCDHW') sparse_batch_norm(sparse_x) + + def test2(self): + with _test_eager_guard(): + paddle.seed(123) + channels = 3 + x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') + dense_x = paddle.to_tensor(x_data) + sparse_x = dense_x.to_sparse_coo(4) + batch_norm = paddle.sparse.BatchNorm(channels) + batch_norm_out = batch_norm(sparse_x) + print(batch_norm_out.shape) + # [1, 6, 6, 6, 3] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/sparse/layer/norm.py b/python/paddle/sparse/layer/norm.py index 39b0028fe8ece..83b738a5dc354 100644 --- a/python/paddle/sparse/layer/norm.py +++ b/python/paddle/sparse/layer/norm.py @@ -66,8 +66,8 @@ class BatchNorm(paddle.nn.BatchNorm1D): Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. - epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. + epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as weight_attr. If it is set to Fasle, the weight is not learnable. @@ -92,13 +92,12 @@ class BatchNorm(paddle.nn.BatchNorm1D): .. 
code-block:: python import paddle - import numpy as np from paddle.fluid.framework import _test_eager_guard with _test_eager_guard(): - np.random.seed(123) + paddle.seed(123) channels = 3 - x_data = np.random.random(size=(1, 6, 6, 6, channels)).astype('float32') + x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') dense_x = paddle.to_tensor(x_data) sparse_x = dense_x.to_sparse_coo(4) batch_norm = paddle.sparse.BatchNorm(channels) From 56774d9d4760699a44db3a97996088d5a7aa2498 Mon Sep 17 00:00:00 2001 From: zkh2016 Date: Thu, 21 Apr 2022 10:22:47 +0000 Subject: [PATCH 23/23] fix examples --- python/paddle/sparse/creation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/sparse/creation.py b/python/paddle/sparse/creation.py index 0b3ee29f87bdf..2cfbb3144acc2 100644 --- a/python/paddle/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -111,7 +111,7 @@ def sparse_coo_tensor(indices, with _test_eager_guard(): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] - dense_shape = [2, 3] + dense_shape = [3, 3] coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) # print(coo) # Tensor(shape=[2, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,