From 1b2bc14cb3307660265f0e31d46e42fadc5ec832 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 3 Aug 2022 06:54:26 +0000 Subject: [PATCH 01/12] initial commit --- paddle/phi/api/yaml/api.yaml | 12 +- paddle/phi/api/yaml/backward.yaml | 11 + paddle/phi/infermeta/multiary.cc | 75 +++++ paddle/phi/infermeta/multiary.h | 7 + .../kernels/gpu/graph_send_ue_recv_funcs.h | 131 ++++++++ .../kernels/gpu/graph_send_uv_grad_kernel.cu | 295 ++++++++++++++++++ .../phi/kernels/gpu/graph_send_uv_kernel.cu | 166 ++++++++++ .../phi/kernels/graph_send_uv_grad_kernel.h | 33 ++ paddle/phi/kernels/graph_send_uv_kernel.h | 31 ++ .../impl/graph_send_ue_recv_kernel_impl.h | 139 +++++++++ .../tests/unittests/test_graph_send_uv_op.py | 146 +++++++++ 11 files changed, 1045 insertions(+), 1 deletion(-) create mode 100644 paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h create mode 100644 paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/graph_send_uv_kernel.cu create mode 100644 paddle/phi/kernels/graph_send_uv_grad_kernel.h create mode 100644 paddle/phi/kernels/graph_send_uv_kernel.h create mode 100644 paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h create mode 100644 python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 1156206ee4b51..c00d64cf853e7 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -98,6 +98,16 @@ func : erf backward : erf_grad +- api : graph_send_uv + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type = "ADD") + output : Tensor(out) + infer_meta : + func : GraphSendUVInferMeta + kernel : + func : graph_send_uv + data_type : x + backward : graph_send_uv_grad + - api : lgamma args : (Tensor x) output : Tensor(out) @@ -105,7 +115,7 @@ func : UnchangedInferMeta kernel : func : lgamma - backward : lgamma_grad + backward : lgamma_grad - api : mv args : (Tensor x, Tensor vec) diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 53cdc97a716d7..c1d411c54ca50 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -105,6 +105,17 @@ func : erf_grad data_type : out_grad +- backward_api : graph_send_uv_grad + forward : graph_send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type = "ADD") -> Tensor(out) + args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str compute_type = "ADD") + output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] + kernel : + func : graph_send_uv_grad + data_type : x + - backward_api : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index a524506c7f07b..7953900d706c0 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/phi/common/scalar.h" #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/kernels/funcs/common_shape.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { @@ -2410,6 +2411,80 @@ void Yolov3LossInferMeta(const MetaTensor& x, gt_match_mask->set_dtype(x.dtype()); } +void GraphSendUVInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& compute_type, + MetaTensor* out) { + std::cout << "Enter GraphSendUVInferMeta" << std::endl; + + auto src_index_dims = src_index.dims(); + + std::cout << "Check whether enter here.\n"; + if (src_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(src_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Src_index should be 1 when it " + "is 2D, but we get %d", + src_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + src_index_dims.size(), + 1, + phi::errors::InvalidArgument( + "The Src_index should be 1D, when it is not 2D, but we get %d", + src_index_dims.size())); + } + + auto dst_index_dims = dst_index.dims(); + if (dst_index_dims.size() == 2) { + PADDLE_ENFORCE_EQ(dst_index_dims[1], + 1, + phi::errors::InvalidArgument( + "The last dim of Dst_index should be 1 when it " + "is 2D, but we get %d", + dst_index_dims[1])); + } else { + PADDLE_ENFORCE_EQ( + dst_index_dims.size(), + 1, + phi::errors::InvalidArgument("The Dst_index should be 1D, " + "when it is not 2D, but we get %d", + dst_index_dims.size())); + } + + PADDLE_ENFORCE_EQ(src_index_dims[0], + dst_index_dims[0], + phi::errors::InvalidArgument( + "Src_index and Dst_index should have the same shape.")); + + // Infer out's shape according to x and y(need broadcasting condition) + out->set_dtype(x.dtype()); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto x_dims1 = phi::vectorize(x_dims); + auto y_dims1 = phi::vectorize(y_dims); + std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); + std::vector y_dims2(y_dims1.begin() + 1, y_dims1.end()); + int max_dim = std::max(x_dims2.size(), y_dims2.size()); + int axis = std::abs(static_cast(x_dims2.size() - y_dims2.size())); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + // Only need to broadcast dimensions other than the 0th dimension. 
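+  // Example: x dims [num_nodes_x, 10, 1] and y dims [num_nodes_y, 10, 20]
+  // broadcast over the trailing dims to [10, 20]; with src_index of length
+  // num_edges, the output dims become [num_edges, 10, 20].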
+ phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(y_dims2), + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + out_dims_array.insert(out_dims_array.begin(), src_index_dims[0]); + out->set_dims(phi::make_ddim(out_dims_array)); +} + } // namespace phi PD_REGISTER_INFER_META_FN(batch_norm, phi::BatchNormInferMeta); diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 60342dc58f5c9..8f585350044a1 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -420,4 +420,11 @@ void Yolov3LossInferMeta(const MetaTensor& x, MetaTensor* objectness_mask, MetaTensor* gt_match_mask); +void GraphSendUVInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& src_index, + const MetaTensor& dst_index, + const std::string& compute_type, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h new file mode 100644 index 0000000000000..2853a522b83d0 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
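+
+// Shared GPU helpers for the graph_send_ue_recv / graph_send_uv kernels:
+// copying host-side broadcast offsets to device memory, choosing a
+// thread-block width, atomic reduce functors, and the fused
+// gather-compute-scatter kernel they parameterize.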
+ +#pragma once +#include + +#include +#include + +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" + +namespace phi { + +#define CUDA_MAX_NUM_THREADS 1024 + +inline void CopyBCastOff(const BroadCastInfo& bcast_info, + thrust::device_vector& l_bcastoff, + thrust::device_vector& r_bcastoff) { + l_bcastoff.resize(bcast_info.out_len); + r_bcastoff.resize(bcast_info.out_len); +#ifdef PADDLE_WITH_HIP + hipMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + hipMemcpyHostToDevice); + hipMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + hipMemcpyHostToDevice); +#else + cudaMemcpy(thrust::raw_pointer_cast(l_bcastoff.data()), + bcast_info.l_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + cudaMemcpyHostToDevice); + cudaMemcpy(thrust::raw_pointer_cast(r_bcastoff.data()), + bcast_info.r_offset.data(), + sizeof(int64_t) * bcast_info.out_len, + cudaMemcpyHostToDevice); +#endif +} + +inline int FindNumThreads(int dim, int max_num_threads = CUDA_MAX_NUM_THREADS) { + PADDLE_ENFORCE_GE(dim, + 0, + phi::errors::PreconditionNotMet( + "Required dim >= 0, but received dim = %d", dim)); + if (dim == 0) return 1; + int res = max_num_threads; + while (res > dim) { + res = res >> 1; + } + return res; +} + +template +struct GraphSendUERecvSumCUDAFunctor { + DEVICE inline void operator()(T* output, T val) { + paddle::platform::CudaAtomicAdd(output, val); + } +}; + +template +struct GraphSendUERecvMaxCUDAFunctor { + DEVICE inline void operator()(T* output, T val) { + paddle::platform::CudaAtomicMax(output, val); + } +}; + +template +struct GraphSendUERecvMinCUDAFunctor { + DEVICE inline void operator()(T* output, T val) { + paddle::platform::CudaAtomicMin(output, val); + } +}; + +template +__global__ void GraphSendUERecvCUDAKernel(const T* x_data, + const T* e_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int64_t* xbcast_off, + const int64_t* ebcast_off, + T* output, + int64_t index_size, + int64_t x_len, + int64_t e_len, + int64_t out_len, + bool use_bcast, + ComputeFunctor cfunctor, + ReduceFunctor rfunctor) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT src = src_indices[ty]; + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + const T* x_off = x_data + src * x_len; + const T* e_off = e_data + ty * e_len; + T* out_off = output + dst * out_len; + while (tx < out_len) { + int64_t x_add = use_bcast ? xbcast_off[tx] : tx; + int64_t e_add = use_bcast ? ebcast_off[tx] : tx; + T val = cfunctor(x_off[x_add], e_off[e_add]); + rfunctor(out_off + tx, val); + tx += stride_x; + } + ty += stride_y; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu new file mode 100644 index 0000000000000..96ec93ae555e6 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu @@ -0,0 +1,295 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/graph_send_uv_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +namespace phi { + +template +void CalculateGrad(const Context& ctx, + const T* out_grad, + const IndexT* s_index, + const IndexT* d_index, + const phi::DDim& out_grad_dims, + const phi::DDim& x_grad_dims, + const std::string& compute_type, + int64_t index_size, + int64_t slice_size, + T* x_grad, + const DenseTensor& out_grad_tensor, + const DenseTensor& y) { + std::vector reduce_idx; + bool reduce = ReduceGrad(out_grad_dims, x_grad_dims, reduce_idx); + + if (compute_type == "ADD") { +#ifdef PADDLE_WITH_HIP + int block = 256; +#else + int block = 1024; +#endif + int64_t n = slice_size * index_size; + int max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t grid_tmp = (n + block - 1) / block; + int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; + GraphSendRecvSumCUDAFunctor functor; + if (!reduce) { + GraphSendRecvCUDAKernel> + <<>>(out_grad, + d_index, + s_index, + x_grad, + index_size, + slice_size, + functor); + } else { + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); + DenseTensor x_grad_v2 = phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + GraphSendRecvCUDAKernel> + <<>>(out_grad, + d_index, + s_index, + x_grad, + index_size, + bcast_info.out_len, + functor); + // Run reduce sum + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); +#ifdef PADDLE_WITH_HIP + hipMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + hipMemcpyDeviceToDevice); +#else + cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); +#endif + } + } else if (compute_type == "MUL") { + const auto& bcast_info = phi::CalcBCastInfo(y.dims(), out_grad_dims); + thrust::device_vector l_bcastoff, r_bcastoff; + if (bcast_info.use_bcast) { + CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); + } + int64_t out_len = bcast_info.out_len; + const int ntx = FindNumThreads(out_len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (out_len + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid_(nbx, nby); + const dim3 block_(ntx, nty); + funcs::MultiplyFunctor mul_functor; + GraphSendUERecvSumCUDAFunctor sum_functor; + const T* y_data = y.data(); + if (!reduce) { + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + y_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + 
thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + sum_functor); + } else { + DenseTensor x_grad_v2 = phi::EmptyLike(ctx, out_grad_tensor); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); + GraphSendUERecvCUDAKernel, + funcs::MultiplyFunctor> + <<>>( + y_data, + out_grad, + d_index, + s_index, + thrust::raw_pointer_cast(l_bcastoff.data()), + thrust::raw_pointer_cast(r_bcastoff.data()), + x_grad_v2_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor, + sum_functor); + // Run reduce_sum + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); +#ifdef PADDLE_WITH_HIP + hipMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + hipMemcpyDeviceToDevice); +#else + cudaMemcpy(x_grad, + x_grad_out.data(), + x_grad_out.numel() * sizeof(T), + cudaMemcpyDeviceToDevice); +#endif + } + } +} + +template +void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const int& index_size = dst_index.dims()[0]; + + ctx.template Alloc(x_grad); + T* x_grad_data = x_grad->data(); + ctx.template Alloc(y_grad); + T* y_grad_data = y_grad->data(); + const auto& x_grad_dims = x_grad->dims(); + const auto& y_grad_dims = y_grad->dims(); + int64_t memset_size_x = 1, memset_size_y = 1; + int64_t slice_size_x = 1, slice_size_y = 1; + for (int i = 0; i < x_grad_dims.size(); i++) { + memset_size_x *= x_grad_dims[i]; + if (i > 0) slice_size_x *= x_grad_dims[i]; + } + for (int i = 0; i < y_grad_dims.size(); i++) { + memset_size_y *= y_grad_dims[i]; + if (i > 0) slice_size_y *= y_grad_dims[i]; + } + const size_t& memset_bytes_x = memset_size_x * sizeof(T); + const size_t& memset_bytes_y = memset_size_y * sizeof(T); +#ifdef PADDLE_WITH_HIP + hipMemset(x_grad_data, 0, memset_bytes_x); + hipMemset(y_grad_data, 0, memset_bytes_y); +#else + cudaMemset(x_grad_data, 0, memset_bytes_x); + cudaMemset(y_grad_data, 0, memset_bytes_y); +#endif + + if (index_size == 0) return; + + const T* out_grad_data = out_grad.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + // Calculate X grad. + const auto& out_grad_dims = out_grad.dims(); + CalculateGrad(ctx, + out_grad_data, + s_index, + d_index, + out_grad_dims, + x_grad_dims, + compute_type, + index_size, + slice_size_x, + x_grad_data, + out_grad, + y); + // Calculate Y grad. 
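+  // Note: the src/dst index arguments are swapped relative to the X-grad
+  // call above, since the forward pass gathers x by src_index and y by
+  // dst_index.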
+ CalculateGrad(ctx, + out_grad_data, + d_index, + s_index, + out_grad_dims, + y_grad_dims, + compute_type, + index_size, + slice_size_y, + y_grad_data, + out_grad, + x); +} + +template +void GraphSendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& compute_type, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendUVGradOpCUDAKernelLaunchHelper(ctx, + x, + y, + out_grad, + src_index, + dst_index, + compute_type, + x_grad, + y_grad); + } else if (index_type == phi::DataType::INT64) { + GraphSendUVGradOpCUDAKernelLaunchHelper(ctx, + x, + y, + out_grad, + src_index, + dst_index, + compute_type, + x_grad, + y_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_uv_grad, + GPU, + ALL_LAYOUT, + phi::GraphSendUVGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu new file mode 100644 index 0000000000000..2a8bb9eed5750 --- /dev/null +++ b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu @@ -0,0 +1,166 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/graph_send_uv_kernel.h" +#include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" + +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +template +__global__ void GraphSendUVCUDAKernel(const T* x_data, + const T* y_data, + const IndexT* src_indices, + const IndexT* dst_indices, + const int64_t* xbcast_off, + const int64_t* ybcast_off, + T* output, + int64_t index_size, + int64_t x_len, + int64_t y_len, + int64_t out_len, + bool use_bcast, + ComputeFunctor cfunctor) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; + + while (ty < index_size) { + IndexT src = src_indices[ty]; + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + const T* x_off = x_data + src * x_len; + const T* y_off = y_data + dst * y_len; + T* out_off = output + ty * out_len; + while (tx < out_len) { + int64_t x_add = use_bcast ? xbcast_off[tx] : tx; + int64_t y_add = use_bcast ? 
ybcast_off[tx] : tx; + T val = cfunctor(x_off[x_add], y_off[y_add]); + out_off[tx] = val; + tx += stride_x; + } + ty += stride_y; + } +} + +template +void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* out) { + const int& index_size = src_index.dims()[0]; + auto out_dims = out->dims(); + int64_t memset_size = 1; + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + ctx.template Alloc(out); + T* out_data = out->data(); + if (index_size == 0) return; + + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), y.dims()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + + thrust::device_vector x_bcastoff, y_bcastoff; + if (bcast_info.use_bcast) { + CopyBCastOff(bcast_info, x_bcastoff, y_bcastoff); + } + + int64_t out_len = bcast_info.out_len; + const int ntx = FindNumThreads(out_len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (out_len + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid(nbx, nby); + const dim3 block(ntx, nty); + if (compute_type == "ADD") { + funcs::AddFunctor add_functor; + GraphSendUVCUDAKernel> + <<>>( + x_data, + y_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(y_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + add_functor); + } else if (compute_type == "MUL") { + funcs::MultiplyFunctor mul_functor; + GraphSendUVCUDAKernel> + <<>>( + x_data, + y_data, + s_index, + d_index, + thrust::raw_pointer_cast(x_bcastoff.data()), + thrust::raw_pointer_cast(y_bcastoff.data()), + out_data, + index_size, + bcast_info.l_len, + bcast_info.r_len, + out_len, + bcast_info.use_bcast, + mul_functor); + } +} + +template +void GraphSendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* out) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendUVOpCUDAKernelLaunchHelper( + ctx, x, y, src_index, dst_index, compute_type, out); + } else if (index_type == phi::DataType::INT64) { + GraphSendUVOpCUDAKernelLaunchHelper( + ctx, x, y, src_index, dst_index, compute_type, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_uv, + GPU, + ALL_LAYOUT, + phi::GraphSendUVKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/graph_send_uv_grad_kernel.h b/paddle/phi/kernels/graph_send_uv_grad_kernel.h new file mode 100644 index 0000000000000..329e4e61e9e74 --- /dev/null +++ b/paddle/phi/kernels/graph_send_uv_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GraphSendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& compute_type, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/graph_send_uv_kernel.h b/paddle/phi/kernels/graph_send_uv_kernel.h new file mode 100644 index 0000000000000..159a27fc30a3b --- /dev/null +++ b/paddle/phi/kernels/graph_send_uv_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GraphSendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h new file mode 100644 index 0000000000000..35e51fb930c8d --- /dev/null +++ b/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h @@ -0,0 +1,139 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/kernels/funcs/common_shape.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +struct BroadCastInfo { + bool use_bcast; + // l_offset[i] indicates the start position of tensor lhs that required to + // compute the i-th element in output, so as r_offset[i]. 
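+  // Example: per-row lhs shape [10, 1] against rhs shape [10, 20] gives
+  // out_len = 200 with l_offset[i] = i / 20 and r_offset[i] = i.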
+ std::vector l_offset, r_offset; + int64_t l_len, r_len, out_len, reduce_size; +}; + +inline bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { + if (l_dims.size() != r_dims.size()) { + return true; + } + for (int i = 1; i < l_dims.size(); i++) { + if (l_dims[i] != r_dims[i]) { + return true; + } + } + return false; +} + +inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, + const phi::DDim& r_dims) { + BroadCastInfo binfo; + binfo.use_bcast = UseBroadCast(l_dims, r_dims); + binfo.l_len = 1; + binfo.r_len = 1; + for (int i = 1; i < l_dims.size(); i++) { + binfo.l_len *= l_dims[i]; + } + for (int i = 1; i < r_dims.size(); i++) { + binfo.r_len *= r_dims[i]; + } + // TODO(daisiming): Whether to add dot. + binfo.reduce_size = 1; + if (binfo.use_bcast) { + const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; + int stride_l = 1, stride_r = 1; + binfo.l_offset.emplace_back(0); + binfo.r_offset.emplace_back(0); + int out_len = 1; + for (int i = 0; i < max_dim; i++) { + // Iterate the axis from back to front. + const int dl = + (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; + const int dr = + (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; + for (int j = 1; j < std::max(dl, dr); j++) { + for (int k = 0; k < out_len; k++) { + binfo.l_offset.emplace_back(binfo.l_offset[k] + + j * (j < dl) * stride_l); + binfo.r_offset.emplace_back(binfo.r_offset[k] + + j * (j < dr) * stride_r); + } + } + out_len *= std::max(dl, dr); + stride_l *= dl; + stride_r *= dr; + } + binfo.out_len = out_len; + } else { + binfo.out_len = binfo.l_len; + } + return binfo; +} + +inline std::vector InferBroadcastShape(const phi::DDim& x_dims, + const phi::DDim& e_dims, + const std::string& type = "x") { + auto x_dims1 = phi::vectorize(x_dims); + auto e_dims1 = phi::vectorize(e_dims); + std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); + std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); + int max_dim = std::max(x_dims2.size(), e_dims2.size()); + int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); + std::vector x_dims_array(max_dim); + std::vector e_dims_array(max_dim); + std::vector out_dims_array(max_dim); + // Only need to broadcast dimensions other than the 0th dimension. + phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), + phi::make_ddim(e_dims2), + x_dims_array.data(), + e_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + if (type == "x") { + out_dims_array.insert(out_dims_array.begin(), x_dims[0]); + } else { + out_dims_array.insert(out_dims_array.begin(), e_dims[0]); + } + return out_dims_array; +} + +inline bool ReduceGrad(const phi::DDim& out_grad_dims, + const phi::DDim& x_dims, + std::vector& axis) { + // We must ensure the ndim of out_grad and x are the same. + bool reduce = false; + for (int i = 1; i < out_grad_dims.size(); i++) { + if (out_grad_dims[i] != x_dims[i]) { + reduce = true; + break; + } + } + if (!reduce) return false; + + // Get reduce axis. + for (int i = 1; i < out_grad_dims.size(); i++) { + if (out_grad_dims[i] - x_dims[i] != 0) { + axis.emplace_back(i); + } + } + return true; +} + +} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py new file mode 100644 index 0000000000000..3d66df06cc5ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard + +from op_test import OpTest + + +def get_broadcast_shape(shp1, shp2): + pad_shp1, pad_shp2 = shp1, shp2 + if len(shp1) > len(shp2): + pad_shp2 = [ + 1, + ] * (len(shp1) - len(shp2)) + shp2 + elif len(shp1) < len(shp2): + pad_shp1 = [ + 1, + ] * (len(shp2) - len(shp1)) + shp1 + for d1, d2 in zip(pad_shp1, pad_shp2): + if d1 != d2 and d1 != 1 and d2 != 1: + raise ValueError + rst = [max(d1, d2) for d1, d2 in zip(pad_shp1, pad_shp2)] + return rst + + +class BroadCastInfo(object): + + def __init__(self, x_shape, e_shape): + self.x_shape = x_shape + self.e_shape = e_shape + + self.calculate_bcastinfo() + + def use_bcast(self): + if len(self.x_shape) != len(self.e_shape): + return True + for i in range(1, len(self.x_shape)): + if self.x_shape[i] != self.e_shape[i]: + return True + return False + + def calculate_bcastinfo(self): + lhs_len = 1 + rhs_len = 1 + for i in range(1, len(self.x_shape)): + lhs_len *= self.x_shape[i] + for i in range(1, len(self.e_shape)): + rhs_len *= self.e_shape[i] + use_b = self.use_bcast() + + if use_b: + max_ndim = max(len(self.x_shape), len(self.e_shape)) - 1 + out_len = 1 + stride_l = stride_r = 1 + lhs_offset = [0] + rhs_offset = [0] + for j in range(0, max_ndim): + dl = 1 if (len(self.x_shape) - 1 - j) < 1 \ + else self.x_shape[len(self.x_shape) - 1 - j] + dr = 1 if (len(self.e_shape) - 1 - j) < 1 \ + else self.e_shape[len(self.e_shape) - 1 - j] + for i in range(1, max(dl, dr)): + for k in range(0, out_len): + lhs_offset.append(lhs_offset[k] + i * + (i < dl) * stride_l) + rhs_offset.append(rhs_offset[k] + i * + (i < dr) * stride_r) + + out_len *= max(dl, dr) + stride_l *= dl + stride_r *= dr + else: + out_len = rhs_len + + self.use_broadcast = use_b + self.out_len = out_len + self.lhs_len = lhs_len + self.rhs_len = rhs_len + if use_b: + self.lhs_offset = lhs_offset + self.rhs_offset = rhs_offset + + +def compute_graph_send_uv(inputs, attributes): + x = inputs['x'] + y = inputs['y'] + src_index = inputs['src_index'] + dst_index = inputs['dst_index'] + compute_type = attributes['compute_type'] + + gather_x = x[src_index] + gather_y = y[dst_index] + + # Calculate forward output. 
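+    # For edge i this computes x[src_index[i]] op y[dst_index[i]], with
+    # NumPy broadcasting over the trailing feature dims.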
+ if compute_type == "ADD": + results = gather_x + gather_y + elif compute_type == "MUL": + results = gather_x * gather_y + return results + + +class TestGraphSendUVOp(OpTest): + + def setUp(self): + paddle.enable_static() + self.op_type = "graph_send_uv" + self.set_config() + self.inputs = { + 'x': self.x, + 'y': self.y, + 'src_index': self.src_index, + 'dst_index': self.dst_index + } + self.attrs = {'compute_type': self.compute_type} + out = compute_graph_send_uv(self.inputs, self.attrs) + self.outputs = {'out': out} + + def test_check_output(self): + self.check_output(check_eager=True) + + # def test_check_grad(self): + # self.check_grad(['x', 'y'], 'Out', check_eager=True) + + def set_config(self): + self.x = np.random.random((10, 20)).astype("float64") + self.y = np.random.random((10, 20)).astype("float64") + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' From 6f456000ecb5fcf1b516fc56c0d64bc5379847b0 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Wed, 3 Aug 2022 08:57:25 +0000 Subject: [PATCH 02/12] fix op maker bug --- paddle/phi/api/yaml/generator/filters.py | 4 +- paddle/phi/infermeta/multiary.cc | 4 - .../tests/unittests/test_graph_send_uv_op.py | 81 +------------------ 3 files changed, 5 insertions(+), 84 deletions(-) diff --git a/paddle/phi/api/yaml/generator/filters.py b/paddle/phi/api/yaml/generator/filters.py index cda858ab6e74e..de9fdf25e9834 100644 --- a/paddle/phi/api/yaml/generator/filters.py +++ b/paddle/phi/api/yaml/generator/filters.py @@ -86,9 +86,9 @@ def to_opmaker_name(s): def to_opmaker_name_cstr(s): if s.endswith("_grad"): - return '"{}@GRAD"'.format(to_pascal_case(s[:-5])) + return '"{}@GRAD"'.format(s[:-5]) else: - return '"{}"'.format(to_pascal_case(s)) + return '"{}"'.format(s) def to_pascal_case(s): diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7953900d706c0..e379ee8129134 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2417,11 +2417,7 @@ void GraphSendUVInferMeta(const MetaTensor& x, const MetaTensor& dst_index, const std::string& compute_type, MetaTensor* out) { - std::cout << "Enter GraphSendUVInferMeta" << std::endl; - auto src_index_dims = src_index.dims(); - - std::cout << "Check whether enter here.\n"; if (src_index_dims.size() == 2) { PADDLE_ENFORCE_EQ(src_index_dims[1], 1, diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py index 3d66df06cc5ed..2affb75ee7a61 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py @@ -22,81 +22,6 @@ from op_test import OpTest -def get_broadcast_shape(shp1, shp2): - pad_shp1, pad_shp2 = shp1, shp2 - if len(shp1) > len(shp2): - pad_shp2 = [ - 1, - ] * (len(shp1) - len(shp2)) + shp2 - elif len(shp1) < len(shp2): - pad_shp1 = [ - 1, - ] * (len(shp2) - len(shp1)) + shp1 - for d1, d2 in zip(pad_shp1, pad_shp2): - if d1 != d2 and d1 != 1 and d2 != 1: - raise ValueError - rst = [max(d1, d2) for d1, d2 in zip(pad_shp1, pad_shp2)] - return rst - - -class BroadCastInfo(object): - - def __init__(self, x_shape, e_shape): - self.x_shape = x_shape - self.e_shape = e_shape - - self.calculate_bcastinfo() - - def use_bcast(self): - if len(self.x_shape) != len(self.e_shape): - return True - for i in range(1, len(self.x_shape)): - if self.x_shape[i] != self.e_shape[i]: - 
return True - return False - - def calculate_bcastinfo(self): - lhs_len = 1 - rhs_len = 1 - for i in range(1, len(self.x_shape)): - lhs_len *= self.x_shape[i] - for i in range(1, len(self.e_shape)): - rhs_len *= self.e_shape[i] - use_b = self.use_bcast() - - if use_b: - max_ndim = max(len(self.x_shape), len(self.e_shape)) - 1 - out_len = 1 - stride_l = stride_r = 1 - lhs_offset = [0] - rhs_offset = [0] - for j in range(0, max_ndim): - dl = 1 if (len(self.x_shape) - 1 - j) < 1 \ - else self.x_shape[len(self.x_shape) - 1 - j] - dr = 1 if (len(self.e_shape) - 1 - j) < 1 \ - else self.e_shape[len(self.e_shape) - 1 - j] - for i in range(1, max(dl, dr)): - for k in range(0, out_len): - lhs_offset.append(lhs_offset[k] + i * - (i < dl) * stride_l) - rhs_offset.append(rhs_offset[k] + i * - (i < dr) * stride_r) - - out_len *= max(dl, dr) - stride_l *= dl - stride_r *= dr - else: - out_len = rhs_len - - self.use_broadcast = use_b - self.out_len = out_len - self.lhs_len = lhs_len - self.rhs_len = rhs_len - if use_b: - self.lhs_offset = lhs_offset - self.rhs_offset = rhs_offset - - def compute_graph_send_uv(inputs, attributes): x = inputs['x'] y = inputs['y'] @@ -132,10 +57,10 @@ def setUp(self): self.outputs = {'out': out} def test_check_output(self): - self.check_output(check_eager=True) + self.check_output(check_eager=False) - # def test_check_grad(self): - # self.check_grad(['x', 'y'], 'Out', check_eager=True) + def test_check_grad(self): + self.check_grad(['x', 'y'], 'out', check_eager=False) def set_config(self): self.x = np.random.random((10, 20)).astype("float64") From c667bd09ce1ffd7b0043a54d764a8eaf99030450 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Thu, 4 Aug 2022 08:20:09 +0000 Subject: [PATCH 03/12] fix mul grad bug --- .../kernels/gpu/graph_send_uv_grad_kernel.cu | 38 ++++++++++++- .../tests/unittests/test_graph_send_uv_op.py | 56 +++++++++++++++++++ 2 files changed, 91 insertions(+), 3 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu index 96ec93ae555e6..49741b65bd9f7 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu @@ -27,6 +27,29 @@ namespace phi { +/*template +__global__ void GraphSendUVCUDAKernel(const T* out_grad, + const IndexT* src_indices, + const IndexT* dst_indices, + int64_t index_size, + int64_t slice_size, + T* x_grad) { + Idx ty = blockIdx.y * blockDim.y + threadIdx.y; + const Idx stride_y = blockDim.y * gridDim.y; + while (ty < index_size) { + IndexT src = src_indices[ty]; + IndexT dst = dst_indices[ty]; + int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t stride_x = blockDim.x * gridDim.x; + + const T* e_off = e_data + ty * slice_size; + T* x_grad_off = x_grad + dst * slice_size; + while (tx < slice_size) { + + } + } +}*/ + template void CalculateGrad(const Context& ctx, const T* out_grad, @@ -64,12 +87,17 @@ void CalculateGrad(const Context& ctx, slice_size, functor); } else { + // TODO(daisiming): 反向很多bug const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); - DenseTensor x_grad_v2 = phi::EmptyLike(ctx, out_grad_tensor); + auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, + out_grad_dims_1.end()); + out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]); + DenseTensor x_grad_v2 = phi::Empty(ctx, out_grad_dims_2); phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = 
x_grad_v2.data(); GraphSendRecvCUDAKernel> - <<>>(out_grad, + <<>>(out_grad, // 觉得有点怪 d_index, s_index, x_grad, @@ -132,7 +160,11 @@ void CalculateGrad(const Context& ctx, mul_functor, sum_functor); } else { - DenseTensor x_grad_v2 = phi::EmptyLike(ctx, out_grad_tensor); + auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, + out_grad_dims_1.end()); + out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]); + DenseTensor x_grad_v2 = phi::Empty(ctx, out_grad_dims_2); phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = x_grad_v2.data(); GraphSendUERecvCUDAKernel Date: Fri, 5 Aug 2022 06:31:36 +0000 Subject: [PATCH 04/12] add unittest --- .../kernels/gpu/graph_send_uv_grad_kernel.cu | 8 +++---- .../tests/unittests/test_graph_send_uv_op.py | 22 +++++++++++++++++++ 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu index 49741b65bd9f7..c4a9d111be491 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu @@ -27,7 +27,7 @@ namespace phi { -/*template +template __global__ void GraphSendUVCUDAKernel(const T* out_grad, const IndexT* src_indices, const IndexT* dst_indices, @@ -45,10 +45,9 @@ __global__ void GraphSendUVCUDAKernel(const T* out_grad, const T* e_off = e_data + ty * slice_size; T* x_grad_off = x_grad + dst * slice_size; while (tx < slice_size) { - } } -}*/ +} template void CalculateGrad(const Context& ctx, @@ -87,7 +86,6 @@ void CalculateGrad(const Context& ctx, slice_size, functor); } else { - // TODO(daisiming): 反向很多bug const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); auto out_grad_dims_1 = phi::vectorize(out_grad_dims); std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, @@ -97,7 +95,7 @@ void CalculateGrad(const Context& ctx, phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = x_grad_v2.data(); GraphSendRecvCUDAKernel> - <<>>(out_grad, // 觉得有点怪 + <<>>(out_grad, d_index, s_index, x_grad, diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py index 0d7e2faeaaf9b..b5ca6364555c5 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py @@ -125,3 +125,25 @@ def set_config(self): self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' + + +class TestCase6(TestGraphSendUVOp): + + def set_config(self): + self.x = np.random.random((10, 10, 1)).astype("float64") + self.y = np.random.random((10, 10, 10)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'ADD' + + +class TestCase7(TestGraphSendUVOp): + + def set_config(self): + self.x = np.random.random((10, 10, 1)).astype("float64") + self.y = np.random.random((10, 10, 10)) + index = np.random.randint(0, 10, (15, 2)).astype(np.int64) + self.src_index = index[:, 0] + self.dst_index = index[:, 1] + self.compute_type = 'MUL' From 1f66a0dd60f2d95015889d3311b40aaf23edbafd Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Sun, 7 Aug 2022 07:34:01 +0000 Subject: [PATCH 05/12] fix add grad bug, add cpu kernel --- .../kernels/cpu/graph_send_ue_recv_funcs.h | 46 +++ .../kernels/cpu/graph_send_uv_grad_kernel.cc | 268 ++++++++++++++++++ 
.../phi/kernels/cpu/graph_send_uv_kernel.cc | 125 ++++++++ .../kernels/gpu/graph_send_uv_grad_kernel.cu | 75 ++--- .../phi/kernels/gpu/graph_send_uv_kernel.cu | 2 +- 5 files changed, 478 insertions(+), 38 deletions(-) create mode 100644 paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h create mode 100644 paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/graph_send_uv_kernel.cc diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h b/paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h new file mode 100644 index 0000000000000..7647415d8e7cb --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct GraphAddFunctor { + inline T operator()(const T a, const T b) const { return a + b; } +}; + +template +struct GraphMulFunctor { + inline T operator()(const T a, const T b) const { return a * b; } +}; + +template +struct GraphMaxFunctor { + inline T operator()(const T a, const T b) const { return a < b ? b : a; } +}; + +template +struct GraphMinFunctor { + inline T operator()(const T a, const T b) const { return a < b ? a : b; } +}; + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc new file mode 100644 index 0000000000000..4b224726b6ffb --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc @@ -0,0 +1,268 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/graph_send_uv_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/reduce_sum_kernel.h" + +namespace phi { + +template +void CalculateGrad(const Context& ctx, + const T* out_grad, + const IndexT* s_index, + const IndexT* d_index, + const phi::DDim& out_grad_dims, + const phi::DDim& x_grad_dims, + const std::string& compute_type, + int64_t index_size, + int64_t slice_size, + T* x_grad, + const DenseTensor& out_grad_tensor, + const DenseTensor& y) { + std::vector reduce_idx; + bool reduce = ReduceGrad(out_grad_dims, x_grad_dims, reduce_idx); + + if (compute_type == "ADD") { + if (!reduce) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT dst = d_index[i]; + T* x_grad_off = x_grad + dst * slice_size; + const T* out_grad_off = out_grad + i * slice_size; + for (int64_t j = 0; j < slice_size; j++) { + if (out_grad_off[j] != 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + x_grad_off[j] += out_grad_off[j]; + } + } + } + } else { + const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); + auto out_grad_dims_1 = phi::vectorize(out_grad_dims); + std::vector out_grad_dims_2(out_grad_dims_1.begin() + 1, + out_grad_dims_1.end()); + out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]); + DenseTensor x_grad_v2 = phi::Empty(ctx, out_grad_dims_2); + phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); + T* x_grad_v2_data = x_grad_v2.data(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT dst = d_index[i]; + T* x_grad_off = x_grad_v2_data + dst * bcast_info.out_len; + const T* out_grad_off = out_grad + i * bcast_info.out_len; + for (int64_t j = 0; j < bcast_info.out_len; j++) { + if (out_grad_off[j] != 0) { +#ifdef PADDLE_WITH_MKLML +#pragma omp atomic +#endif + x_grad_off[j] += out_grad_off[j]; + } + } + } + DenseTensor x_grad_out = phi::Sum( + ctx, + x_grad_v2, + reduce_idx, + paddle::experimental::CppTypeToDataType::Type(), + true); + memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); + } + } else if (compute_type == "MUL") { + const auto& bcast = phi::CalcBCastInfo(y.dims(), out_grad_dims); + const T* y_data = y.data(); + if (!reduce) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = s_index[i]; + IndexT dst = d_index[i]; + T* x_grad_off = x_grad + dst * bcast.out_len; + const T* y_off = y_data + src * bcast.l_len; + const T* out_grad_off = out_grad + i * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t y_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t o_add = bcast.use_bcast ? 
bcast.r_offset[j] : j;
+          T val = y_off[y_add] * out_grad_off[o_add];
+          if (val != 0) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp atomic
+#endif
+            x_grad_off[j] += val;
+          }
+        }
+      }
+    } else {
+      auto out_grad_dims_1 = phi::vectorize(out_grad_dims);
+      std::vector<int64_t> out_grad_dims_2(out_grad_dims_1.begin() + 1,
+                                           out_grad_dims_1.end());
+      out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]);
+      DenseTensor x_grad_v2 = phi::Empty<T, Context>(ctx, out_grad_dims_2);
+      phi::funcs::SetConstant<Context, T>()(ctx, &x_grad_v2, T(0));
+      T* x_grad_v2_data = x_grad_v2.data<T>();
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+      for (int64_t i = 0; i < index_size; i++) {
+        IndexT src = s_index[i];
+        IndexT dst = d_index[i];
+        T* x_grad_off = x_grad_v2_data + dst * bcast.out_len;
+        const T* y_off = y_data + src * bcast.l_len;
+        const T* out_grad_off = out_grad + i * bcast.r_len;
+        for (int64_t j = 0; j < bcast.out_len; j++) {
+          int64_t y_add = bcast.use_bcast ? bcast.l_offset[j] : j;
+          int64_t o_add = bcast.use_bcast ? bcast.r_offset[j] : j;
+          T val = y_off[y_add] * out_grad_off[o_add];
+          if (val != 0) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp atomic
+#endif
+            x_grad_off[j] += val;
+          }
+        }
+      }
+      DenseTensor x_grad_out = phi::Sum<T, Context>(
+          ctx,
+          x_grad_v2,
+          reduce_idx,
+          paddle::experimental::CppTypeToDataType<T>::Type(),
+          true);
+      memcpy(x_grad, x_grad_out.data<T>(), x_grad_out.numel() * sizeof(T));
+    }
+  }
+}
+
+template <typename Context, typename T, typename IndexT>
+void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx,
+                                         const DenseTensor& x,
+                                         const DenseTensor& y,
+                                         const DenseTensor& out_grad,
+                                         const DenseTensor& src_index,
+                                         const DenseTensor& dst_index,
+                                         const std::string& compute_type,
+                                         DenseTensor* x_grad,
+                                         DenseTensor* y_grad) {
+  const int64_t& index_size = dst_index.dims()[0];
+
+  ctx.template Alloc<T>(x_grad);
+  T* x_grad_data = x_grad->data<T>();
+  ctx.template Alloc<T>(y_grad);
+  T* y_grad_data = y_grad->data<T>();
+  const auto& x_grad_dims = x_grad->dims();
+  const auto& y_grad_dims = y_grad->dims();
+  int64_t memset_size_x = 1, memset_size_y = 1;
+  int64_t slice_size_x = 1, slice_size_y = 1;
+  for (int i = 0; i < x_grad_dims.size(); i++) {
+    memset_size_x *= x_grad_dims[i];
+    if (i > 0) slice_size_x *= x_grad_dims[i];
+  }
+  for (int i = 0; i < y_grad_dims.size(); i++) {
+    memset_size_y *= y_grad_dims[i];
+    if (i > 0) slice_size_y *= y_grad_dims[i];
+  }
+  const size_t& memset_bytes_x = memset_size_x * sizeof(T);
+  const size_t& memset_bytes_y = memset_size_y * sizeof(T);
+  memset(x_grad_data, 0, memset_bytes_x);
+  memset(y_grad_data, 0, memset_bytes_y);
+
+  if (index_size == 0) return;
+  const T* out_grad_data = out_grad.data<T>();
+  const IndexT* s_index = src_index.data<IndexT>();
+  const IndexT* d_index = dst_index.data<IndexT>();
+  const auto& out_grad_dims = out_grad.dims();
+  // Calculate X Grad.
+  CalculateGrad<Context, T, IndexT>(ctx,
+                                    out_grad_data,
+                                    d_index,
+                                    s_index,
+                                    out_grad_dims,
+                                    x_grad_dims,
+                                    compute_type,
+                                    index_size,
+                                    slice_size_x,
+                                    x_grad_data,
+                                    out_grad,
+                                    y);
+  // Calculate Y Grad.
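+  // As in the GPU kernel, the index arguments are swapped here relative to
+  // the X-grad call above, matching how x and y were gathered forward.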
+ CalculateGrad(ctx, + out_grad_data, + s_index, + d_index, + out_grad_dims, + y_grad_dims, + compute_type, + index_size, + slice_size_y, + y_grad_data, + out_grad, + x); +} + +template +void GraphSendUVGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const DenseTensor& out_grad, + const std::string& compute_type, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendUVGradOpKernelLaunchHelper(ctx, + x, + y, + out_grad, + src_index, + dst_index, + compute_type, + x_grad, + y_grad); + } else if (index_type == phi::DataType::INT64) { + GraphSendUVGradOpKernelLaunchHelper(ctx, + x, + y, + out_grad, + src_index, + dst_index, + compute_type, + x_grad, + y_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_uv_grad, + CPU, + ALL_LAYOUT, + phi::GraphSendUVGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc new file mode 100644 index 0000000000000..3689f51199777 --- /dev/null +++ b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc @@ -0,0 +1,125 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/graph_send_uv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" +#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" + +namespace phi { + +template +void GraphSendUVCpuKernel(const BroadCastInfo& bcast, + const T* x_data, + const T* y_data, + const IndexT* src_indices, + const IndexT* dst_indices, + T* output, + int64_t index_size, + ComputeFunctor cfunctor) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < index_size; i++) { + IndexT src = src_indices[i]; + IndexT dst = dst_indices[i]; + T* out_off = output + i * bcast.out_len; + const T* x_off = x_data + src * bcast.l_len; + const T* y_off = y_data + dst * bcast.r_len; + for (int64_t j = 0; j < bcast.out_len; j++) { + int64_t x_add = bcast.use_bcast ? bcast.l_offset[j] : j; + int64_t y_add = bcast.use_bcast ? 
bcast.r_offset[j] : j; + T val = cfunctor(x_off[x_add], y_off[y_add]); + out_off[j] = val; + } + } +} + +template +void GraphSendUVOpKernelLaunchHelper(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* out) { + const int& index_size = src_index.dims()[0]; + auto out_dims = out->dims(); + int64_t memset_size = 1; + for (int i = 0; i < out_dims.size(); i++) { + memset_size *= out_dims[i]; + } + ctx.template Alloc(out); + T* out_data = out->data(); + if (index_size == 0) return; + + const auto& bcast_info = phi::CalcBCastInfo(x.dims(), y.dims()); + const T* x_data = x.data(); + const T* y_data = y.data(); + const IndexT* s_index = src_index.data(); + const IndexT* d_index = dst_index.data(); + if (compute_type == "ADD") { + GraphAddFunctor add_functor; + GraphSendUVCpuKernel>(bcast_info, + x_data, + y_data, + s_index, + d_index, + out_data, + index_size, + add_functor); + } else if (compute_type == "MUL") { + GraphMulFunctor mul_functor; + GraphSendUVCpuKernel>(bcast_info, + x_data, + y_data, + s_index, + d_index, + out_data, + index_size, + mul_functor); + } +} + +template +void GraphSendUVKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& src_index, + const DenseTensor& dst_index, + const std::string& compute_type, + DenseTensor* out) { + auto index_type = src_index.dtype(); + if (index_type == phi::DataType::INT32) { + GraphSendUVOpKernelLaunchHelper( + ctx, x, y, src_index, dst_index, compute_type, out); + } else if (index_type == phi::DataType::INT64) { + GraphSendUVOpKernelLaunchHelper( + ctx, x, y, src_index, dst_index, compute_type, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(graph_send_uv, + CPU, + ALL_LAYOUT, + phi::GraphSendUVKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu index c4a9d111be491..ae3c6eea19a39 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu @@ -27,25 +27,28 @@ namespace phi { -template -__global__ void GraphSendUVCUDAKernel(const T* out_grad, - const IndexT* src_indices, - const IndexT* dst_indices, - int64_t index_size, - int64_t slice_size, - T* x_grad) { - Idx ty = blockIdx.y * blockDim.y + threadIdx.y; - const Idx stride_y = blockDim.y * gridDim.y; +template +__global__ void GraphSendUVGradCUDAKernel(const T* out_grad, + const IndexT* src_indices, + const IndexT* dst_indices, + int64_t index_size, + int64_t slice_size, + T* x_grad) { + IndexT ty = blockIdx.y * blockDim.y + threadIdx.y; + const IndexT stride_y = blockDim.y * gridDim.y; while (ty < index_size) { IndexT src = src_indices[ty]; IndexT dst = dst_indices[ty]; int64_t tx = blockIdx.x * blockDim.x + threadIdx.x; int64_t stride_x = blockDim.x * gridDim.x; - const T* e_off = e_data + ty * slice_size; + const T* out_grad_off = out_grad + ty * slice_size; T* x_grad_off = x_grad + dst * slice_size; while (tx < slice_size) { + paddle::platform::CudaAtomicAdd(x_grad_off + tx, out_grad_off[tx]); + tx += stride_x; } + ty += stride_y; } } @@ -66,25 +69,16 @@ void CalculateGrad(const Context& ctx, bool reduce = ReduceGrad(out_grad_dims, x_grad_dims, reduce_idx); if (compute_type == "ADD") { -#ifdef PADDLE_WITH_HIP - int block = 256; -#else - int block = 1024; -#endif - int64_t n = slice_size * index_size; - int max_grid_dimx = 
ctx.GetCUDAMaxGridDimSize()[0]; - int64_t grid_tmp = (n + block - 1) / block; - int64_t grid = grid_tmp < max_grid_dimx ? grid_tmp : max_grid_dimx; - GraphSendRecvSumCUDAFunctor functor; if (!reduce) { - GraphSendRecvCUDAKernel> - <<>>(out_grad, - d_index, - s_index, - x_grad, - index_size, - slice_size, - functor); + const int ntx = FindNumThreads(slice_size); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (slice_size + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid_tmp(nbx, nby); + const dim3 block_tmp(ntx, nty); + GraphSendUVGradCUDAKernel + <<>>( + out_grad, d_index, s_index, index_size, slice_size, x_grad); } else { const auto& bcast_info = phi::CalcBCastInfo(out_grad_dims, x_grad_dims); auto out_grad_dims_1 = phi::vectorize(out_grad_dims); @@ -94,14 +88,21 @@ void CalculateGrad(const Context& ctx, DenseTensor x_grad_v2 = phi::Empty(ctx, out_grad_dims_2); phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = x_grad_v2.data(); - GraphSendRecvCUDAKernel> - <<>>(out_grad, - d_index, - s_index, - x_grad, - index_size, - bcast_info.out_len, - functor); + + const int ntx = FindNumThreads(bcast_info.out_len); + const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int nbx = (bcast_info.out_len + ntx - 1) / ntx; + const int nby = (index_size + nty - 1) / nty; + const dim3 grid_tmp(nbx, nby); + const dim3 block_tmp(ntx, nty); + GraphSendUVGradCUDAKernel + <<>>(out_grad, + d_index, + s_index, + index_size, + bcast_info.out_len, + x_grad_v2_data); + // Run reduce sum DenseTensor x_grad_out = phi::Sum( ctx, @@ -216,7 +217,7 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, const std::string& compute_type, DenseTensor* x_grad, DenseTensor* y_grad) { - const int& index_size = dst_index.dims()[0]; + const int64_t& index_size = dst_index.dims()[0]; ctx.template Alloc(x_grad); T* x_grad_data = x_grad->data(); diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu index 2a8bb9eed5750..31e48f1f90704 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu @@ -70,7 +70,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& dst_index, const std::string& compute_type, DenseTensor* out) { - const int& index_size = src_index.dims()[0]; + const int64_t& index_size = src_index.dims()[0]; auto out_dims = out->dims(); int64_t memset_size = 1; for (int i = 0; i < out_dims.size(); i++) { From a91b584a6cc7d176a449e72b7f34f8bd3c7b13b9 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 8 Aug 2022 03:46:14 +0000 Subject: [PATCH 06/12] add paddle.geometric.message_passing --- python/paddle/geometric/__init__.py | 19 +++++++ .../geometric/message_passing/.send.py.swp | Bin 0 -> 12288 bytes .../geometric/message_passing/__init__.py | 15 ++++++ .../paddle/geometric/message_passing/send.py | 13 +++++ .../paddle/geometric/message_passing/utils.py | 47 ++++++++++++++++++ python/setup.py.in | 2 + 6 files changed, 96 insertions(+) create mode 100644 python/paddle/geometric/__init__.py create mode 100644 python/paddle/geometric/message_passing/.send.py.swp create mode 100644 python/paddle/geometric/message_passing/__init__.py create mode 100644 python/paddle/geometric/message_passing/send.py create mode 100644 python/paddle/geometric/message_passing/utils.py diff --git a/python/paddle/geometric/__init__.py b/python/paddle/geometric/__init__.py new file mode 100644 index 
0000000000000..04a4103debdd9 --- /dev/null +++ b/python/paddle/geometric/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .message_passing import send_uv # noqa: F401 + +__all__ = [ + 'send_uv', +] diff --git a/python/paddle/geometric/message_passing/.send.py.swp b/python/paddle/geometric/message_passing/.send.py.swp new file mode 100644 index 0000000000000000000000000000000000000000..7175c240dbb5b37d0b35e02de41dd55b64e35373 GIT binary patch literal 12288 zcmeI2&5j#I5XWcXzyTHz9N|z*4(x(=>~$gq1?6Bii!x#vBYQVdPNVirji;rV?xwqE zy~Blf;0lidyaC@L1a3S5uR!(MmNpPUuApk^@1E|duCDr(FP3XDx|eKZrFgkqFnV}J7+2HV_^<;WqYqi~6 zqHfV!SqxirPAoAu!a1pyLx-wNO#dX0yvGEXz*73Py`FGkTL(r2WM z^epMC-6;Bk^f^hArlgGYHt8+WKIwPz|10Sc=@-&N($A!D?@y#3Nk5RjCw)ia!33B9 z6JP>NfC(@GCcp%k02BB}0ze~<<4c?EbBPs%4&uB!-@6zK0UbZQ=|TvXTWfLM=P$riP_HN2{ox8tA%KQfqZ^DpW~c?x&}Br)Q^de=?g*veWc< z4$~PNO|v`cX*$hl^*&6p2k=3f-5CH%3c> literal 0 HcmV?d00001 diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py new file mode 100644 index 0000000000000..5beaea8aac2e5 --- /dev/null +++ b/python/paddle/geometric/message_passing/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .send import send_uv # noqa: F401 diff --git a/python/paddle/geometric/message_passing/send.py b/python/paddle/geometric/message_passing/send.py new file mode 100644 index 0000000000000..513558501a0eb --- /dev/null +++ b/python/paddle/geometric/message_passing/send.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
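Before the broadcasting helpers in utils.py below, it may help to see what the send_uv primitive is meant to compute. The following is a minimal NumPy sketch of the semantics, mirroring the reference computation used in test_graph_send_uv_op.py; send_uv_reference and the sample data are illustrative only and are not part of the Paddle API (note the attribute is named compute_type at this point in the series and is renamed to message_op by a later patch):

import numpy as np

def send_uv_reference(x, y, src_index, dst_index, compute_type="add"):
    # Illustrative reference only; not part of the Paddle API.
    # For each edge i, gather the source-node feature x[src_index[i]]
    # and the destination-node feature y[dst_index[i]], then combine
    # them elementwise to produce that edge's feature vector.
    ops = {
        "add": np.add,
        "sub": np.subtract,
        "mul": np.multiply,
        "div": np.divide,
    }
    gather_x = x[src_index]  # shape: [num_edges] + x.shape[1:]
    gather_y = y[dst_index]  # shape: [num_edges] + y.shape[1:]
    return ops[compute_type](gather_x, gather_y)

x = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
y = np.array([[1, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32")
src_index = np.array([0, 1, 2, 0])
dst_index = np.array([1, 2, 1, 0])
print(send_uv_reference(x, y, src_index, dst_index, "add"))
# [[ 2.  5.  7.]
#  [ 5.  9. 11.]
#  [ 4.  9. 11.]
#  [ 1.  3.  5.]]

A later patch in this series rewrites sub as add with a negated y and div as mul with a reciprocal y at the Python layer, which is why the reference above supports four ops while the C++/CUDA kernels only dispatch on ADD and MUL.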
diff --git a/python/paddle/geometric/message_passing/utils.py b/python/paddle/geometric/message_passing/utils.py new file mode 100644 index 0000000000000..a0d54360f27a1 --- /dev/null +++ b/python/paddle/geometric/message_passing/utils.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +def reshape_lhs_rhs(x, y): + """ + Expand dims to ensure there will be no broadcasting issues with different + number of dimensions. + """ + if len(x.shape) == 1: + x = paddle.reshape(x, [-1, 1]) + if len(y.shape) == 1: + y = paddle.reshape(y, [-1, 1]) + + x_shape = paddle.shape(x) + y_shape = paddle.shape(y) + if len(x.shape) != len(y.shape): + max_ndims = max(len(x.shape), len(y.shape)) + x_pad_ndims = max_ndims - len(x.shape) + y_pad_ndims = max_ndims - len(y.shape) + new_x_shape = [ + x_shape[0], + ] + [ + 1, + ] * x_pad_ndims + list(x_shape[1:]) + new_y_shape = [ + y_shape[0], + ] + [ + 1, + ] * y_pad_ndims + list(y_shape[1:]) + x = paddle.reshape(x, new_x_shape) + y = paddle.reshape(y, new_y_shape) + + return x, y diff --git a/python/setup.py.in b/python/setup.py.in index 1b36b272d0d70..b317da7972214 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -400,6 +400,8 @@ packages=['paddle', 'paddle.device.cuda', 'paddle.version', 'paddle.profiler', + 'paddle.geometric', + 'paddle.geometric.message_passing', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: From 6fdfcb16af1adde163c447aa8c9fed250358f355 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 8 Aug 2022 07:49:44 +0000 Subject: [PATCH 07/12] add paddle.geometric.send_uv api, add unittest --- python/paddle/__init__.py | 1 + .../tests/unittests/test_graph_send_uv_op.py | 116 ++++++++++++++++- .../paddle/geometric/message_passing/send.py | 121 ++++++++++++++++++ 3 files changed, 236 insertions(+), 2 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 2e4dc53290226..1c013e1ca3494 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -78,6 +78,7 @@ import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +import paddle.geometric # noqa: F401 from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py index b5ca6364555c5..aa44b172025ed 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py @@ -41,10 +41,17 @@ def compute_graph_send_uv(inputs, attributes): return results +def graph_send_uv_wrapper(x, y, src_index, dst_index, compute_type="add"): + return paddle.geometric.send_uv(x, y, src_index, dst_index, + compute_type.lower()) + + class TestGraphSendUVOp(OpTest): def setUp(self): paddle.enable_static() + self.python_api = graph_send_uv_wrapper + 
self.python_out_sig = ['out'] self.op_type = "graph_send_uv" self.set_config() self.inputs = { @@ -58,10 +65,10 @@ def setUp(self): self.outputs = {'out': out} def test_check_output(self): - self.check_output(check_eager=False) + self.check_output(check_eager=True) def test_check_grad(self): - self.check_grad(['x', 'y'], 'out', check_eager=False) + self.check_grad(['x', 'y'], 'out', check_eager=True) def set_config(self): self.x = np.random.random((10, 20)).astype("float64") @@ -147,3 +154,108 @@ def set_config(self): self.src_index = index[:, 0] self.dst_index = index[:, 1] self.compute_type = 'MUL' + + +class API_GeometricSendUVTest(unittest.TestCase): + + def test_compute_all_dygraph(self): + paddle.disable_static() + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + y = paddle.to_tensor([[1, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32") + src_index = paddle.to_tensor(np.array([0, 1, 2, 0]), dtype="int32") + dst_index = paddle.to_tensor(np.array([1, 2, 1, 0]), dtype="int32") + + res_add = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="add") + res_sub = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="sub") + res_mul = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="mul") + res_div = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="div") + res = [res_add, res_sub, res_mul, res_div] + + np_add = np.array([[2, 5, 7], [5, 9, 11], [4, 9, 11], [1, 3, 5]], + dtype="float32") + np_sub = np.array([[-2, -1, -1], [-3, -1, -1], [0, 3, 3], [-1, 1, 1]], + dtype="float32") + np_mul = np.array([[0, 6, 12], [4, 20, 30], [4, 18, 28], [0, 2, 6]], + dtype="float32") + np_div = np.array( + [[0, 2 / 3, 0.75], [0.25, 0.8, 5 / 6], [1, 2, 7 / 4], [0, 2, 1.5]], + dtype="float32") + + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], res): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) + + def test_compute_all_static(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + x = paddle.static.data(name="x", shape=[3, 3], dtype="float32") + y = paddle.static.data(name="y", shape=[3, 3], dtype="float32") + src_index = paddle.static.data(name="src", shape=[4], dtype="int32") + dst_index = paddle.static.data(name="dst", shape=[4], dtype="int32") + res_add = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="add") + res_sub = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="sub") + res_mul = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="mul") + res_div = paddle.geometric.send_uv(x, + y, + src_index, + dst_index, + compute_type="div") + + exe = paddle.static.Executor(paddle.CPUPlace()) + data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + data2 = np.array([[1, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32") + data3 = np.array([0, 1, 2, 0], dtype="int32") + data4 = np.array([1, 2, 1, 0], dtype="int32") + + np_add = np.array([[2, 5, 7], [5, 9, 11], [4, 9, 11], [1, 3, 5]], + dtype="float32") + np_sub = np.array( + [[-2, -1, -1], [-3, -1, -1], [0, 3, 3], [-1, 1, 1]], + dtype="float32") + np_mul = np.array([[0, 6, 12], [4, 20, 30], [4, 18, 28], [0, 2, 6]], + dtype="float32") + np_div = np.array([[0, 2 / 3, 0.75], [0.25, 0.8, 5 / 6], + [1, 2, 7 / 4], [0, 2, 1.5]], + dtype="float32") + + ret = exe.run(feed={ + 'x': data1, + 'y': data2, + 'src': data3, + 'dst': data4, + 
}, + fetch_list=[res_add, res_sub, res_mul, res_div]) + for np_res, paddle_res in zip([np_add, np_sub, np_mul, np_div], + ret): + self.assertTrue( + np.allclose(np_res, paddle_res, atol=1e-6), "two value is\ + {}\n{}, check diff!".format(np_res, paddle_res)) diff --git a/python/paddle/geometric/message_passing/send.py b/python/paddle/geometric/message_passing/send.py index 513558501a0eb..64363cb8656f8 100644 --- a/python/paddle/geometric/message_passing/send.py +++ b/python/paddle/geometric/message_passing/send.py @@ -11,3 +11,124 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +import numpy as np +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle import _C_ops + +from .utils import reshape_lhs_rhs + + +def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): + """ + + Graph Learning message passing api. + + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as + the destination node feature tensor. Then we use `src_index` and `dst_index` to gather the corresponding data, + and then compute the edge features in different compute_types like `add`, `sub`, `mul`, `div`. + + .. code-block:: text + + Given: + + X = [[0, 2, 3], + [1, 4, 5], + [2, 6, 7]] + + Y = [[0, 1, 2], + [2, 3, 4], + [4, 5, 6]] + + src_index = [0, 1, 2, 0] + + dst_index = [1, 2, 1, 0] + + compute_type = "add" + + Then: + + Out = [[2, 5, 7], + [5, 9, 11], + [4, 9, 11], + [0, 3, 5]] + + Args: + x (Tensor): The source node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. + y (Tensor): The destination node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. + src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. + compute_type (Tensor): Different compute types for x and y, including `add`, `sub`, `mul` and `div`. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + out (Tensor): The output tensor. + + Examples: + + .. code-block:: python + + import paddle + + x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") + y = paddle.to_tensor([[0, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32") + indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") + src_index = indexes[:, 0] + dst_index = indexes[:, 1] + out = paddle.geometric.send_uv(x, y, src_index, dst_index, compute_type="add") + # Outputs: [[2., 5., 7.], [5., 9., 11.], [4., 9., 11.], [0., 3., 5.]] + + """ + + if compute_type not in ['add', 'sub', 'mul', 'div']: + raise ValueError( + "compute_type should be `add`, `sub`, `mul`, `div`, but received %s" + % compute_type) + + x, y = reshape_lhs_rhs(x, y) + + if compute_type == 'sub': + compute_type = 'add' + y = -y + if compute_type == 'div': + compute_type = 'mul' + y = 1. 
/ y + + if in_dygraph_mode(): + return _C_ops.final_state_graph_send_uv(x, y, src_index, dst_index, + compute_type.upper()) + else: + if _in_legacy_dygraph(): + return _C_ops.graph_send_uv(x, y, src_index, dst_index, + "compute_type", compute_type.upper()) + else: + helper = LayerHelper("send_uv", **locals()) + check_variable_and_dtype(x, 'x', + ['int32', 'int64', 'float32', 'float64'], + 'graph_send_uv') + check_variable_and_dtype(y, 'y', + ['int32', 'int64', 'float32', 'float64'], + 'graph_send_uv') + check_variable_and_dtype(src_index, 'src_index', ['int32', 'int64'], + 'graph_send_uv') + check_variable_and_dtype(dst_index, 'dst_index', ['int32', 'int64'], + 'graph_send_uv') + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + inputs = { + 'x': x, + 'y': y, + 'src_index': src_index, + 'dst_index': dst_index + } + attrs = {'compute_type': compute_type.upper()} + helper.append_op(type="graph_send_uv", + inputs=inputs, + attrs=attrs, + outputs={"out": out}) + return out From c21919984beb617a37b18dc88ed528ae010feae4 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Mon, 8 Aug 2022 13:06:21 +0000 Subject: [PATCH 08/12] add fp16 judgement --- python/paddle/geometric/message_passing/send.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/geometric/message_passing/send.py b/python/paddle/geometric/message_passing/send.py index 64363cb8656f8..1d26c45c5f9ef 100644 --- a/python/paddle/geometric/message_passing/send.py +++ b/python/paddle/geometric/message_passing/send.py @@ -108,12 +108,12 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): "compute_type", compute_type.upper()) else: helper = LayerHelper("send_uv", **locals()) - check_variable_and_dtype(x, 'x', - ['int32', 'int64', 'float32', 'float64'], - 'graph_send_uv') - check_variable_and_dtype(y, 'y', - ['int32', 'int64', 'float32', 'float64'], - 'graph_send_uv') + check_variable_and_dtype( + x, 'x', ['int32', 'int64', 'float32', 'float64', 'float16'], + 'graph_send_uv') + check_variable_and_dtype( + y, 'y', ['int32', 'int64', 'float32', 'float64', 'float16'], + 'graph_send_uv') check_variable_and_dtype(src_index, 'src_index', ['int32', 'int64'], 'graph_send_uv') check_variable_and_dtype(dst_index, 'dst_index', ['int32', 'int64'], From 6ceb41668ef9b81d70ca6bf75cd6c8d92f7a23a5 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 12 Aug 2022 04:52:33 +0000 Subject: [PATCH 09/12] fix file typo, move compute_type to message_op --- paddle/phi/api/yaml/api.yaml | 2 +- paddle/phi/api/yaml/backward.yaml | 4 +- paddle/phi/infermeta/multiary.cc | 2 +- paddle/phi/infermeta/multiary.h | 2 +- .../cpu/graph_send_ue_recv_grad_kernel.cc | 2 +- .../kernels/cpu/graph_send_ue_recv_kernel.cc | 2 +- .../kernels/cpu/graph_send_uv_grad_kernel.cc | 38 ++--- .../phi/kernels/cpu/graph_send_uv_kernel.cc | 14 +- .../gpu/graph_send_ue_recv_grad_kernel.cu | 2 +- .../kernels/gpu/graph_send_ue_recv_kernel.cu | 2 +- .../kernels/gpu/graph_send_uv_grad_kernel.cu | 51 +++---- .../phi/kernels/gpu/graph_send_uv_kernel.cu | 18 +-- .../phi/kernels/graph_send_uv_grad_kernel.h | 2 +- paddle/phi/kernels/graph_send_uv_kernel.h | 2 +- .../impl/graph_messaage_passing_impl.h | 140 ------------------ .../impl/graph_send_ue_recv_kernel_impl.h | 139 ----------------- .../tests/unittests/test_graph_send_uv_op.py | 44 +++--- .../geometric/message_passing/.send.py.swp | Bin 12288 -> 0 bytes .../paddle/geometric/message_passing/send.py | 36 ++--- 19 files changed, 98 
insertions(+), 404 deletions(-) delete mode 100644 paddle/phi/kernels/impl/graph_messaage_passing_impl.h delete mode 100644 paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h delete mode 100644 python/paddle/geometric/message_passing/.send.py.swp diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/api.yaml index 3454fe7e7d925..dafc15646e4b1 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/api.yaml @@ -136,7 +136,7 @@ backward : fft_r2c_grad - api : graph_send_uv - args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type = "ADD") + args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) infer_meta : func : GraphSendUVInferMeta diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index 574f554bd0203..d2ed2533ae03e 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -148,8 +148,8 @@ no_need_buffer: x - backward_api : graph_send_uv_grad - forward : graph_send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str compute_type = "ADD") -> Tensor(out) - args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str compute_type = "ADD") + forward : graph_send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) + args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") output : Tensor(x_grad), Tensor(y_grad) infer_meta : func : GeneralBinaryGradInferMeta diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 41e4acc323e4c..27c3d20d853c3 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -2691,7 +2691,7 @@ void GraphSendUVInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, MetaTensor* out) { auto src_index_dims = src_index.dims(); if (src_index_dims.size() == 2) { diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 3d0ec5fa22c4d..05c834cebf361 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -480,7 +480,7 @@ void GraphSendUVInferMeta(const MetaTensor& x, const MetaTensor& y, const MetaTensor& src_index, const MetaTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, MetaTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc index 95fdc6ff0a9cc..c7b1e3e51853b 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_grad_kernel.cc @@ -24,7 +24,7 @@ #include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc index 74fca002294db..ab9adc3897170 100644 --- a/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_ue_recv_kernel.cc @@ -22,7 +22,7 @@ #include "paddle/phi/core/hostdevice.h" #include 
"paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" namespace phi { diff --git a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc index 4b224726b6ffb..bd92875461da7 100644 --- a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc @@ -19,7 +19,7 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -31,7 +31,7 @@ void CalculateGrad(const Context& ctx, const IndexT* d_index, const phi::DDim& out_grad_dims, const phi::DDim& x_grad_dims, - const std::string& compute_type, + const std::string& message_op, int64_t index_size, int64_t slice_size, T* x_grad, @@ -40,7 +40,7 @@ void CalculateGrad(const Context& ctx, std::vector reduce_idx; bool reduce = ReduceGrad(out_grad_dims, x_grad_dims, reduce_idx); - if (compute_type == "ADD") { + if (message_op == "ADD") { if (!reduce) { #ifdef PADDLE_WITH_MKLML #pragma omp parallel for @@ -91,7 +91,7 @@ void CalculateGrad(const Context& ctx, true); memcpy(x_grad, x_grad_out.data(), x_grad_out.numel() * sizeof(T)); } - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { const auto& bcast = phi::CalcBCastInfo(y.dims(), out_grad_dims); const T* y_data = y.data(); if (!reduce) { @@ -163,7 +163,7 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx, const DenseTensor& out_grad, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* x_grad, DenseTensor* y_grad) { const int64_t& index_size = dst_index.dims()[0]; @@ -201,7 +201,7 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx, s_index, out_grad_dims, x_grad_dims, - compute_type, + message_op, index_size, slice_size_x, x_grad_data, @@ -214,7 +214,7 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx, d_index, out_grad_dims, y_grad_dims, - compute_type, + message_op, index_size, slice_size_y, y_grad_data, @@ -229,30 +229,16 @@ void GraphSendUVGradKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const DenseTensor& out_grad, - const std::string& compute_type, + const std::string& message_op, DenseTensor* x_grad, DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { - GraphSendUVGradOpKernelLaunchHelper(ctx, - x, - y, - out_grad, - src_index, - dst_index, - compute_type, - x_grad, - y_grad); + GraphSendUVGradOpKernelLaunchHelper( + ctx, x, y, out_grad, src_index, dst_index, message_op, x_grad, y_grad); } else if (index_type == phi::DataType::INT64) { - GraphSendUVGradOpKernelLaunchHelper(ctx, - x, - y, - out_grad, - src_index, - dst_index, - compute_type, - x_grad, - y_grad); + GraphSendUVGradOpKernelLaunchHelper( + ctx, x, y, out_grad, src_index, dst_index, message_op, x_grad, y_grad); } } diff --git a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc index 3689f51199777..6404999ba23df 100644 --- a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc +++ 
b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc @@ -18,7 +18,7 @@ #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" namespace phi { @@ -55,7 +55,7 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* out) { const int& index_size = src_index.dims()[0]; auto out_dims = out->dims(); @@ -72,7 +72,7 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx, const T* y_data = y.data(); const IndexT* s_index = src_index.data(); const IndexT* d_index = dst_index.data(); - if (compute_type == "ADD") { + if (message_op == "ADD") { GraphAddFunctor add_functor; GraphSendUVCpuKernel>(bcast_info, x_data, @@ -82,7 +82,7 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx, out_data, index_size, add_functor); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { GraphMulFunctor mul_functor; GraphSendUVCpuKernel>(bcast_info, x_data, @@ -101,15 +101,15 @@ void GraphSendUVKernel(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* out) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVOpKernelLaunchHelper( - ctx, x, y, src_index, dst_index, compute_type, out); + ctx, x, y, src_index, dst_index, message_op, out); } else if (index_type == phi::DataType::INT64) { GraphSendUVOpKernelLaunchHelper( - ctx, x, y, src_index, dst_index, compute_type, out); + ctx, x, y, src_index, dst_index, message_op, out); } } diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu index cb3d5591a7be6..c5d5fb7196fb2 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_grad_kernel.cu @@ -21,7 +21,7 @@ #include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { diff --git a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu index f339387f0bbfc..7351c562dff9d 100644 --- a/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_ue_recv_kernel.cu @@ -15,7 +15,7 @@ #include "paddle/phi/kernels/graph_send_ue_recv_kernel.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_messaage_passing_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include #include diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu index ae3c6eea19a39..4fb0335241d01 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu @@ -22,7 +22,7 @@ #include 
"paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/gpu/graph_send_recv_funcs.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include "paddle/phi/kernels/reduce_sum_kernel.h" namespace phi { @@ -59,7 +59,7 @@ void CalculateGrad(const Context& ctx, const IndexT* d_index, const phi::DDim& out_grad_dims, const phi::DDim& x_grad_dims, - const std::string& compute_type, + const std::string& message_op, int64_t index_size, int64_t slice_size, T* x_grad, @@ -68,10 +68,10 @@ void CalculateGrad(const Context& ctx, std::vector reduce_idx; bool reduce = ReduceGrad(out_grad_dims, x_grad_dims, reduce_idx); - if (compute_type == "ADD") { + if (message_op == "ADD") { if (!reduce) { - const int ntx = FindNumThreads(slice_size); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(slice_size, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (slice_size + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid_tmp(nbx, nby); @@ -89,8 +89,9 @@ void CalculateGrad(const Context& ctx, phi::funcs::SetConstant()(ctx, &x_grad_v2, T(0)); T* x_grad_v2_data = x_grad_v2.data(); - const int ntx = FindNumThreads(bcast_info.out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = + FindNumThreads(bcast_info.out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (bcast_info.out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid_tmp(nbx, nby); @@ -122,15 +123,15 @@ void CalculateGrad(const Context& ctx, cudaMemcpyDeviceToDevice); #endif } - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { const auto& bcast_info = phi::CalcBCastInfo(y.dims(), out_grad_dims); thrust::device_vector l_bcastoff, r_bcastoff; if (bcast_info.use_bcast) { CopyBCastOff(bcast_info, l_bcastoff, r_bcastoff); } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid_(nbx, nby); @@ -214,7 +215,7 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& out_grad, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* x_grad, DenseTensor* y_grad) { const int64_t& index_size = dst_index.dims()[0]; @@ -258,7 +259,7 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, d_index, out_grad_dims, x_grad_dims, - compute_type, + message_op, index_size, slice_size_x, x_grad_data, @@ -271,7 +272,7 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx, s_index, out_grad_dims, y_grad_dims, - compute_type, + message_op, index_size, slice_size_y, y_grad_data, @@ -286,30 +287,16 @@ void GraphSendUVGradKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const DenseTensor& out_grad, - const std::string& compute_type, + const std::string& message_op, DenseTensor* x_grad, DenseTensor* y_grad) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { - GraphSendUVGradOpCUDAKernelLaunchHelper(ctx, - 
x, - y, - out_grad, - src_index, - dst_index, - compute_type, - x_grad, - y_grad); + GraphSendUVGradOpCUDAKernelLaunchHelper( + ctx, x, y, out_grad, src_index, dst_index, message_op, x_grad, y_grad); } else if (index_type == phi::DataType::INT64) { - GraphSendUVGradOpCUDAKernelLaunchHelper(ctx, - x, - y, - out_grad, - src_index, - dst_index, - compute_type, - x_grad, - y_grad); + GraphSendUVGradOpCUDAKernelLaunchHelper( + ctx, x, y, out_grad, src_index, dst_index, message_op, x_grad, y_grad); } } diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu index 31e48f1f90704..05f38a5dd3cda 100644 --- a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu @@ -14,7 +14,7 @@ #include "paddle/phi/kernels/graph_send_uv_kernel.h" #include "paddle/phi/kernels/gpu/graph_send_ue_recv_funcs.h" -#include "paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h" +#include "paddle/phi/kernels/impl/graph_message_passing_impl.h" #include @@ -68,7 +68,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* out) { const int64_t& index_size = src_index.dims()[0]; auto out_dims = out->dims(); @@ -92,13 +92,13 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, } int64_t out_len = bcast_info.out_len; - const int ntx = FindNumThreads(out_len); - const int nty = CUDA_MAX_NUM_THREADS / ntx; + const int ntx = FindNumThreads(out_len, ctx.GetMaxThreadsPerBlock()); + const int nty = ctx.GetMaxThreadsPerBlock() / ntx; const int nbx = (out_len + ntx - 1) / ntx; const int nby = (index_size + nty - 1) / nty; const dim3 grid(nbx, nby); const dim3 block(ntx, nty); - if (compute_type == "ADD") { + if (message_op == "ADD") { funcs::AddFunctor add_functor; GraphSendUVCUDAKernel> <<>>( @@ -115,7 +115,7 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx, out_len, bcast_info.use_bcast, add_functor); - } else if (compute_type == "MUL") { + } else if (message_op == "MUL") { funcs::MultiplyFunctor mul_functor; GraphSendUVCUDAKernel> <<>>( @@ -141,15 +141,15 @@ void GraphSendUVKernel(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* out) { auto index_type = src_index.dtype(); if (index_type == phi::DataType::INT32) { GraphSendUVOpCUDAKernelLaunchHelper( - ctx, x, y, src_index, dst_index, compute_type, out); + ctx, x, y, src_index, dst_index, message_op, out); } else if (index_type == phi::DataType::INT64) { GraphSendUVOpCUDAKernelLaunchHelper( - ctx, x, y, src_index, dst_index, compute_type, out); + ctx, x, y, src_index, dst_index, message_op, out); } } diff --git a/paddle/phi/kernels/graph_send_uv_grad_kernel.h b/paddle/phi/kernels/graph_send_uv_grad_kernel.h index 329e4e61e9e74..fa2285627a4b7 100644 --- a/paddle/phi/kernels/graph_send_uv_grad_kernel.h +++ b/paddle/phi/kernels/graph_send_uv_grad_kernel.h @@ -26,7 +26,7 @@ void GraphSendUVGradKernel(const Context& ctx, const DenseTensor& src_index, const DenseTensor& dst_index, const DenseTensor& out_grad, - const std::string& compute_type, + const std::string& message_op, DenseTensor* x_grad, DenseTensor* y_grad); diff --git a/paddle/phi/kernels/graph_send_uv_kernel.h b/paddle/phi/kernels/graph_send_uv_kernel.h index 159a27fc30a3b..7b723122c1a7f 100644 --- 
a/paddle/phi/kernels/graph_send_uv_kernel.h +++ b/paddle/phi/kernels/graph_send_uv_kernel.h @@ -25,7 +25,7 @@ void GraphSendUVKernel(const Context& ctx, const DenseTensor& y, const DenseTensor& src_index, const DenseTensor& dst_index, - const std::string& compute_type, + const std::string& message_op, DenseTensor* out); } // namespace phi diff --git a/paddle/phi/kernels/impl/graph_messaage_passing_impl.h b/paddle/phi/kernels/impl/graph_messaage_passing_impl.h deleted file mode 100644 index dc1477e77227b..0000000000000 --- a/paddle/phi/kernels/impl/graph_messaage_passing_impl.h +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// Copyright The DGL team. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -struct BroadCastInfo { - bool use_bcast; - // l_offset[i] indicates the start position of tensor lhs that required to - // compute the i-th element in output, so as r_offset[i]. - std::vector l_offset, r_offset; - int64_t l_len, r_len, out_len, reduce_size; -}; - -inline bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { - if (l_dims.size() != r_dims.size()) { - return true; - } - for (int i = 1; i < l_dims.size(); i++) { - if (l_dims[i] != r_dims[i]) { - return true; - } - } - return false; -} - -inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, - const phi::DDim& r_dims) { - BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(l_dims, r_dims); - binfo.l_len = 1; - binfo.r_len = 1; - for (int i = 1; i < l_dims.size(); i++) { - binfo.l_len *= l_dims[i]; - } - for (int i = 1; i < r_dims.size(); i++) { - binfo.r_len *= r_dims[i]; - } - // TODO(daisiming): Whether to add dot. - binfo.reduce_size = 1; - if (binfo.use_bcast) { - const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; - int stride_l = 1, stride_r = 1; - binfo.l_offset.emplace_back(0); - binfo.r_offset.emplace_back(0); - int out_len = 1; - for (int i = 0; i < max_dim; i++) { - // Iterate the axis from back to front. - const int dl = - (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; - const int dr = - (r_dims.size() - 1 - i < 1) ? 
1 : r_dims[r_dims.size() - 1 - i]; - for (int j = 1; j < std::max(dl, dr); j++) { - for (int k = 0; k < out_len; k++) { - binfo.l_offset.emplace_back(binfo.l_offset[k] + - j * (j < dl) * stride_l); - binfo.r_offset.emplace_back(binfo.r_offset[k] + - j * (j < dr) * stride_r); - } - } - out_len *= std::max(dl, dr); - stride_l *= dl; - stride_r *= dr; - } - binfo.out_len = out_len; - } else { - binfo.out_len = binfo.l_len; - } - return binfo; -} - -inline std::vector InferBroadcastShape(const phi::DDim& x_dims, - const phi::DDim& e_dims, - const std::string& type = "x") { - auto x_dims1 = phi::vectorize(x_dims); - auto e_dims1 = phi::vectorize(e_dims); - std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); - std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); - int max_dim = std::max(x_dims2.size(), e_dims2.size()); - int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); - std::vector x_dims_array(max_dim); - std::vector e_dims_array(max_dim); - std::vector out_dims_array(max_dim); - // Only need to broadcast dimensions other than the 0th dimension. - phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(e_dims2), - x_dims_array.data(), - e_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - if (type == "x") { - out_dims_array.insert(out_dims_array.begin(), x_dims[0]); - } else { - out_dims_array.insert(out_dims_array.begin(), e_dims[0]); - } - return out_dims_array; -} - -inline bool ReduceGrad(const phi::DDim& out_grad_dims, - const phi::DDim& x_dims, - std::vector& axis) { - // We must ensure the ndim of out_grad and x are the same. - bool reduce = false; - for (int i = 1; i < out_grad_dims.size(); i++) { - if (out_grad_dims[i] != x_dims[i]) { - reduce = true; - break; - } - } - if (!reduce) return false; - - // Get reduce axis. - for (int i = 1; i < out_grad_dims.size(); i++) { - if (out_grad_dims[i] - x_dims[i] != 0) { - axis.emplace_back(i); - } - } - return true; -} - -} // namespace phi diff --git a/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h b/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h deleted file mode 100644 index 35e51fb930c8d..0000000000000 --- a/paddle/phi/kernels/impl/graph_send_ue_recv_kernel_impl.h +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/phi/kernels/funcs/common_shape.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -namespace phi { - -struct BroadCastInfo { - bool use_bcast; - // l_offset[i] indicates the start position of tensor lhs that required to - // compute the i-th element in output, so as r_offset[i]. 
- std::vector l_offset, r_offset; - int64_t l_len, r_len, out_len, reduce_size; -}; - -inline bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) { - if (l_dims.size() != r_dims.size()) { - return true; - } - for (int i = 1; i < l_dims.size(); i++) { - if (l_dims[i] != r_dims[i]) { - return true; - } - } - return false; -} - -inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims, - const phi::DDim& r_dims) { - BroadCastInfo binfo; - binfo.use_bcast = UseBroadCast(l_dims, r_dims); - binfo.l_len = 1; - binfo.r_len = 1; - for (int i = 1; i < l_dims.size(); i++) { - binfo.l_len *= l_dims[i]; - } - for (int i = 1; i < r_dims.size(); i++) { - binfo.r_len *= r_dims[i]; - } - // TODO(daisiming): Whether to add dot. - binfo.reduce_size = 1; - if (binfo.use_bcast) { - const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1; - int stride_l = 1, stride_r = 1; - binfo.l_offset.emplace_back(0); - binfo.r_offset.emplace_back(0); - int out_len = 1; - for (int i = 0; i < max_dim; i++) { - // Iterate the axis from back to front. - const int dl = - (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i]; - const int dr = - (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i]; - for (int j = 1; j < std::max(dl, dr); j++) { - for (int k = 0; k < out_len; k++) { - binfo.l_offset.emplace_back(binfo.l_offset[k] + - j * (j < dl) * stride_l); - binfo.r_offset.emplace_back(binfo.r_offset[k] + - j * (j < dr) * stride_r); - } - } - out_len *= std::max(dl, dr); - stride_l *= dl; - stride_r *= dr; - } - binfo.out_len = out_len; - } else { - binfo.out_len = binfo.l_len; - } - return binfo; -} - -inline std::vector InferBroadcastShape(const phi::DDim& x_dims, - const phi::DDim& e_dims, - const std::string& type = "x") { - auto x_dims1 = phi::vectorize(x_dims); - auto e_dims1 = phi::vectorize(e_dims); - std::vector x_dims2(x_dims1.begin() + 1, x_dims1.end()); - std::vector e_dims2(e_dims1.begin() + 1, e_dims1.end()); - int max_dim = std::max(x_dims2.size(), e_dims2.size()); - int axis = std::abs(static_cast(x_dims2.size() - e_dims2.size())); - std::vector x_dims_array(max_dim); - std::vector e_dims_array(max_dim); - std::vector out_dims_array(max_dim); - // Only need to broadcast dimensions other than the 0th dimension. - phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2), - phi::make_ddim(e_dims2), - x_dims_array.data(), - e_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - if (type == "x") { - out_dims_array.insert(out_dims_array.begin(), x_dims[0]); - } else { - out_dims_array.insert(out_dims_array.begin(), e_dims[0]); - } - return out_dims_array; -} - -inline bool ReduceGrad(const phi::DDim& out_grad_dims, - const phi::DDim& x_dims, - std::vector& axis) { - // We must ensure the ndim of out_grad and x are the same. - bool reduce = false; - for (int i = 1; i < out_grad_dims.size(); i++) { - if (out_grad_dims[i] != x_dims[i]) { - reduce = true; - break; - } - } - if (!reduce) return false; - - // Get reduce axis. 
- for (int i = 1; i < out_grad_dims.size(); i++) { - if (out_grad_dims[i] - x_dims[i] != 0) { - axis.emplace_back(i); - } - } - return true; -} - -} // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py index aa44b172025ed..ee74016999c68 100644 --- a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py +++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py @@ -27,23 +27,23 @@ def compute_graph_send_uv(inputs, attributes): y = inputs['y'] src_index = inputs['src_index'] dst_index = inputs['dst_index'] - compute_type = attributes['compute_type'] + message_op = attributes['message_op'] gather_x = x[src_index] gather_y = y[dst_index] # Calculate forward output. - if compute_type == "ADD": + if message_op == "ADD": results = gather_x + gather_y - elif compute_type == "MUL": + elif message_op == "MUL": results = gather_x * gather_y return results -def graph_send_uv_wrapper(x, y, src_index, dst_index, compute_type="add"): +def graph_send_uv_wrapper(x, y, src_index, dst_index, message_op="add"): return paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type.lower()) + message_op.lower()) class TestGraphSendUVOp(OpTest): @@ -60,7 +60,7 @@ def setUp(self): 'src_index': self.src_index, 'dst_index': self.dst_index } - self.attrs = {'compute_type': self.compute_type} + self.attrs = {'message_op': self.message_op} out = compute_graph_send_uv(self.inputs, self.attrs) self.outputs = {'out': out} @@ -76,7 +76,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestCase1(TestGraphSendUVOp): @@ -87,7 +87,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestCase2(TestGraphSendUVOp): @@ -98,7 +98,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestCase3(TestGraphSendUVOp): @@ -109,7 +109,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestCase4(TestGraphSendUVOp): @@ -120,7 +120,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestCase5(TestGraphSendUVOp): @@ -131,7 +131,7 @@ def set_config(self): index = np.random.randint(0, 100, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class TestCase6(TestGraphSendUVOp): @@ -142,7 +142,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'ADD' + self.message_op = 'ADD' class TestCase7(TestGraphSendUVOp): @@ -153,7 +153,7 @@ def set_config(self): index = np.random.randint(0, 10, (15, 2)).astype(np.int64) self.src_index = index[:, 0] self.dst_index = index[:, 1] - self.compute_type = 'MUL' + self.message_op = 'MUL' class API_GeometricSendUVTest(unittest.TestCase): @@ -169,22 
+169,22 @@ def test_compute_all_dygraph(self): y, src_index, dst_index, - compute_type="add") + message_op="add") res_sub = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="sub") + message_op="sub") res_mul = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="mul") + message_op="mul") res_div = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="div") + message_op="div") res = [res_add, res_sub, res_mul, res_div] np_add = np.array([[2, 5, 7], [5, 9, 11], [4, 9, 11], [1, 3, 5]], @@ -213,22 +213,22 @@ def test_compute_all_static(self): y, src_index, dst_index, - compute_type="add") + message_op="add") res_sub = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="sub") + message_op="sub") res_mul = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="mul") + message_op="mul") res_div = paddle.geometric.send_uv(x, y, src_index, dst_index, - compute_type="div") + message_op="div") exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.array([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32") diff --git a/python/paddle/geometric/message_passing/.send.py.swp b/python/paddle/geometric/message_passing/.send.py.swp deleted file mode 100644 index 7175c240dbb5b37d0b35e02de41dd55b64e35373..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 12288 zcmeI2&5j#I5XWcXzyTHz9N|z*4(x(=>~$gq1?6Bii!x#vBYQVdPNVirji;rV?xwqE zy~Blf;0lidyaC@L1a3S5uR!(MmNpPUuApk^@1E|duCDr(FP3XDx|eKZrFgkqFnV}J7+2HV_^<;WqYqi~6 zqHfV!SqxirPAoAu!a1pyLx-wNO#dX0yvGEXz*73Py`FGkTL(r2WM z^epMC-6;Bk^f^hArlgGYHt8+WKIwPz|10Sc=@-&N($A!D?@y#3Nk5RjCw)ia!33B9 z6JP>NfC(@GCcp%k02BB}0ze~<<4c?EbBPs%4&uB!-@6zK0UbZQ=|TvXTWfLM=P$riP_HN2{ox8tA%KQfqZ^DpW~c?x&}Br)Q^de=?g*veWc< z4$~PNO|v`cX*$hl^*&6p2k=3f-5CH%3c> diff --git a/python/paddle/geometric/message_passing/send.py b/python/paddle/geometric/message_passing/send.py index 1d26c45c5f9ef..27c57ecfeb9ca 100644 --- a/python/paddle/geometric/message_passing/send.py +++ b/python/paddle/geometric/message_passing/send.py @@ -21,7 +21,7 @@ from .utils import reshape_lhs_rhs -def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): +def send_uv(x, y, src_index, dst_index, message_op="add", name=None): """ Graph Learning message passing api. @@ -29,17 +29,17 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as the destination node feature tensor. Then we use `src_index` and `dst_index` to gather the corresponding data, - and then compute the edge features in different compute_types like `add`, `sub`, `mul`, `div`. + and then compute the edge features in different message_ops like `add`, `sub`, `mul`, `div`. .. code-block:: text Given: - X = [[0, 2, 3], + x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]] - Y = [[0, 1, 2], + y = [[0, 1, 2], [2, 3, 4], [4, 5, 6]] @@ -47,11 +47,11 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): dst_index = [1, 2, 1, 0] - compute_type = "add" + message_op = "add" Then: - Out = [[2, 5, 7], + out = [[2, 5, 7], [5, 9, 11], [4, 9, 11], [0, 3, 5]] @@ -62,7 +62,7 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. 
The available data type is int32, int64. - compute_type (Tensor): Different compute types for x and y, including `add`, `sub`, `mul` and `div`. + message_op (Tensor): Different message ops for x and y, including `add`, `sub`, `mul` and `div`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -80,32 +80,32 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32") src_index = indexes[:, 0] dst_index = indexes[:, 1] - out = paddle.geometric.send_uv(x, y, src_index, dst_index, compute_type="add") + out = paddle.geometric.send_uv(x, y, src_index, dst_index, message_op="add") # Outputs: [[2., 5., 7.], [5., 9., 11.], [4., 9., 11.], [0., 3., 5.]] """ - if compute_type not in ['add', 'sub', 'mul', 'div']: + if message_op not in ['add', 'sub', 'mul', 'div']: raise ValueError( - "compute_type should be `add`, `sub`, `mul`, `div`, but received %s" - % compute_type) + "message_op should be `add`, `sub`, `mul`, `div`, but received %s" % + message_op) x, y = reshape_lhs_rhs(x, y) - if compute_type == 'sub': - compute_type = 'add' + if message_op == 'sub': + message_op = 'add' y = -y - if compute_type == 'div': - compute_type = 'mul' + if message_op == 'div': + message_op = 'mul' y = 1. / y if in_dygraph_mode(): return _C_ops.final_state_graph_send_uv(x, y, src_index, dst_index, - compute_type.upper()) + message_op.upper()) else: if _in_legacy_dygraph(): return _C_ops.graph_send_uv(x, y, src_index, dst_index, - "compute_type", compute_type.upper()) + "message_op", message_op.upper()) else: helper = LayerHelper("send_uv", **locals()) check_variable_and_dtype( @@ -126,7 +126,7 @@ def send_uv(x, y, src_index, dst_index, compute_type="add", name=None): 'src_index': src_index, 'dst_index': dst_index } - attrs = {'compute_type': compute_type.upper()} + attrs = {'message_op': message_op.upper()} helper.append_op(type="graph_send_uv", inputs=inputs, attrs=attrs, From ffc2e3e418299cf585176a5ad3b6229f1a667f29 Mon Sep 17 00:00:00 2001 From: DesmonDay <908660116@qq.com> Date: Fri, 12 Aug 2022 04:54:10 +0000 Subject: [PATCH 10/12] add impl file --- .../kernels/impl/graph_message_passing_impl.h | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 paddle/phi/kernels/impl/graph_message_passing_impl.h diff --git a/paddle/phi/kernels/impl/graph_message_passing_impl.h b/paddle/phi/kernels/impl/graph_message_passing_impl.h new file mode 100644 index 0000000000000..dc1477e77227b --- /dev/null +++ b/paddle/phi/kernels/impl/graph_message_passing_impl.h @@ -0,0 +1,140 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright The DGL team. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/kernels/funcs/common_shape.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+struct BroadCastInfo {
+  bool use_bcast;
+  // l_offset[i] indicates the start position in tensor lhs that is required
+  // to compute the i-th element of the output; likewise for r_offset[i].
+  std::vector<int64_t> l_offset, r_offset;
+  int64_t l_len, r_len, out_len, reduce_size;
+};
+
+inline bool UseBroadCast(const phi::DDim& l_dims, const phi::DDim& r_dims) {
+  if (l_dims.size() != r_dims.size()) {
+    return true;
+  }
+  for (int i = 1; i < l_dims.size(); i++) {
+    if (l_dims[i] != r_dims[i]) {
+      return true;
+    }
+  }
+  return false;
+}
+
+inline BroadCastInfo CalcBCastInfo(const phi::DDim& l_dims,
+                                   const phi::DDim& r_dims) {
+  BroadCastInfo binfo;
+  binfo.use_bcast = UseBroadCast(l_dims, r_dims);
+  binfo.l_len = 1;
+  binfo.r_len = 1;
+  for (int i = 1; i < l_dims.size(); i++) {
+    binfo.l_len *= l_dims[i];
+  }
+  for (int i = 1; i < r_dims.size(); i++) {
+    binfo.r_len *= r_dims[i];
+  }
+  // TODO(daisiming): Whether to add dot.
+  binfo.reduce_size = 1;
+  if (binfo.use_bcast) {
+    const int max_dim = std::max(l_dims.size(), r_dims.size()) - 1;
+    int stride_l = 1, stride_r = 1;
+    binfo.l_offset.emplace_back(0);
+    binfo.r_offset.emplace_back(0);
+    int out_len = 1;
+    for (int i = 0; i < max_dim; i++) {
+      // Iterate the axis from back to front.
+      const int dl =
+          (l_dims.size() - 1 - i < 1) ? 1 : l_dims[l_dims.size() - 1 - i];
+      const int dr =
+          (r_dims.size() - 1 - i < 1) ? 1 : r_dims[r_dims.size() - 1 - i];
+      for (int j = 1; j < std::max(dl, dr); j++) {
+        for (int k = 0; k < out_len; k++) {
+          binfo.l_offset.emplace_back(binfo.l_offset[k] +
+                                      j * (j < dl) * stride_l);
+          binfo.r_offset.emplace_back(binfo.r_offset[k] +
+                                      j * (j < dr) * stride_r);
+        }
+      }
+      out_len *= std::max(dl, dr);
+      stride_l *= dl;
+      stride_r *= dr;
+    }
+    binfo.out_len = out_len;
+  } else {
+    binfo.out_len = binfo.l_len;
+  }
+  return binfo;
+}
+
+inline std::vector<int> InferBroadcastShape(const phi::DDim& x_dims,
+                                            const phi::DDim& e_dims,
+                                            const std::string& type = "x") {
+  auto x_dims1 = phi::vectorize<int>(x_dims);
+  auto e_dims1 = phi::vectorize<int>(e_dims);
+  std::vector<int> x_dims2(x_dims1.begin() + 1, x_dims1.end());
+  std::vector<int> e_dims2(e_dims1.begin() + 1, e_dims1.end());
+  int max_dim = std::max(x_dims2.size(), e_dims2.size());
+  int axis = std::abs(static_cast<int>(x_dims2.size() - e_dims2.size()));
+  std::vector<int> x_dims_array(max_dim);
+  std::vector<int> e_dims_array(max_dim);
+  std::vector<int> out_dims_array(max_dim);
+  // Only need to broadcast dimensions other than the 0th dimension.
+  phi::funcs::GetBroadcastDimsArrays(phi::make_ddim(x_dims2),
+                                     phi::make_ddim(e_dims2),
+                                     x_dims_array.data(),
+                                     e_dims_array.data(),
+                                     out_dims_array.data(),
+                                     max_dim,
+                                     axis);
+  if (type == "x") {
+    out_dims_array.insert(out_dims_array.begin(), x_dims[0]);
+  } else {
+    out_dims_array.insert(out_dims_array.begin(), e_dims[0]);
+  }
+  return out_dims_array;
+}
+
+inline bool ReduceGrad(const phi::DDim& out_grad_dims,
+                       const phi::DDim& x_dims,
+                       std::vector<int64_t>& axis) {
+  // We must ensure the ndim of out_grad and x are the same.
+  bool reduce = false;
+  for (int i = 1; i < out_grad_dims.size(); i++) {
+    if (out_grad_dims[i] != x_dims[i]) {
+      reduce = true;
+      break;
+    }
+  }
+  if (!reduce) return false;
+
+  // Get reduce axis.
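+  // For example (purely illustrative): with out_grad dims [E, 8, 4] and
+  // x dims [E, 8, 1], only axis 2 differs, so axis = {2}; the caller then
+  // sums out_grad over axis 2 before scattering it back into x_grad.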
+  for (int i = 1; i < out_grad_dims.size(); i++) {
+    if (out_grad_dims[i] - x_dims[i] != 0) {
+      axis.emplace_back(i);
+    }
+  }
+  return true;
+}
+
+}  // namespace phi

From 0752fdf6989fb4d97343f29b68f826ef2d7fd1e6 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Fri, 12 Aug 2022 06:27:21 +0000
Subject: [PATCH 11/12] fix unittest timeout time

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt           | 1 +
 python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 9f899552e6987..32e89576a914b 100755
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -1563,6 +1563,7 @@ set_tests_properties(test_pool3d_api PROPERTIES TIMEOUT 120)
 set_tests_properties(test_cumprod_op PROPERTIES TIMEOUT 120)
 set_tests_properties(test_split_program PROPERTIES TIMEOUT 120)
 set_tests_properties(test_graph_send_ue_recv_op PROPERTIES TIMEOUT 60)
+set_tests_properties(test_graph_send_uv_op PROPERTIES TIMEOUT 60)

 if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
diff --git a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py
index ee74016999c68..3d0cc3a57c6b3 100644
--- a/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_graph_send_uv_op.py
@@ -259,3 +259,7 @@ def test_compute_all_static(self):
         self.assertTrue(
             np.allclose(np_res, paddle_res, atol=1e-6), "two value is\
 {}\n{}, check diff!".format(np_res, paddle_res))
+
+    def test_api_eager_dygraph(self):
+        with _test_eager_guard():
+            self.test_compute_all_dygraph()

From 9cb1b16ee2bca999760f58072d3591665ee90da5 Mon Sep 17 00:00:00 2001
From: DesmonDay <908660116@qq.com>
Date: Mon, 15 Aug 2022 07:13:54 +0000
Subject: [PATCH 12/12] add review revise

---
 .../kernels/cpu/graph_send_uv_grad_kernel.cc  |  12 +-
 .../phi/kernels/cpu/graph_send_uv_kernel.cc   |   8 +-
 .../kernels/gpu/graph_send_uv_grad_kernel.cu  |   8 +-
 .../phi/kernels/gpu/graph_send_uv_kernel.cu   |   8 +-
 .../geometric/message_passing/__init__.py     |   8 +-
 .../paddle/geometric/message_passing/send.py  | 134 ------------------
 .../geometric/message_passing/send_recv.py    | 115 +++++++++++++++
 7 files changed, 151 insertions(+), 142 deletions(-)
 delete mode 100644 python/paddle/geometric/message_passing/send.py

diff --git a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc
index bd92875461da7..4e28acdad3db4 100644
--- a/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_uv_grad_kernel.cc
@@ -63,7 +63,7 @@ void CalculateGrad(const Context& ctx,
     auto out_grad_dims_1 = phi::vectorize(out_grad_dims);
     std::vector<int64_t> out_grad_dims_2(out_grad_dims_1.begin() + 1,
                                          out_grad_dims_1.end());
-    out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]);
+    out_grad_dims_2.emplace(out_grad_dims_2.begin(), x_grad_dims[0]);
     DenseTensor x_grad_v2 = phi::Empty<T, Context>(ctx, out_grad_dims_2);
     phi::funcs::SetConstant<Context, T>()(ctx, &x_grad_v2, T(0));
     T* x_grad_v2_data = x_grad_v2.data<T>();
@@ -120,7 +120,7 @@ void CalculateGrad(const Context& ctx,
       auto out_grad_dims_1 = phi::vectorize(out_grad_dims);
       std::vector<int64_t> out_grad_dims_2(out_grad_dims_1.begin() + 1,
                                            out_grad_dims_1.end());
-      out_grad_dims_2.insert(out_grad_dims_2.begin(), x_grad_dims[0]);
+      out_grad_dims_2.emplace(out_grad_dims_2.begin(), x_grad_dims[0]);
      DenseTensor x_grad_v2 = phi::Empty<T, Context>(ctx, out_grad_dims_2);
      phi::funcs::SetConstant<Context, T>()(ctx, &x_grad_v2, T(0));
      T* x_grad_v2_data = x_grad_v2.data<T>();
@@ -168,6 +168,13 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx,
                                          DenseTensor* y_grad) {
   const int64_t& index_size = dst_index.dims()[0];

+  PADDLE_ENFORCE_GT(
+      index_size,
+      0,
+      errors::InvalidArgument("The first dimension of src_index or dst_index "
+                              "should be greater than 0, but received %d.",
+                              index_size));
+
   ctx.template Alloc<T>(x_grad);
   T* x_grad_data = x_grad->data<T>();
   ctx.template Alloc<T>(y_grad);
@@ -189,7 +196,6 @@ void GraphSendUVGradOpKernelLaunchHelper(const Context& ctx,
   memset(x_grad_data, 0, memset_bytes_x);
   memset(y_grad_data, 0, memset_bytes_y);

-  if (index_size == 0) return;
   const T* out_grad_data = out_grad.data<T>();
   const IndexT* s_index = src_index.data<IndexT>();
   const IndexT* d_index = dst_index.data<IndexT>();
diff --git a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc
index 6404999ba23df..2183eb2a4c593 100644
--- a/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc
+++ b/paddle/phi/kernels/cpu/graph_send_uv_kernel.cc
@@ -58,6 +58,13 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx,
                                      const std::string& message_op,
                                      DenseTensor* out) {
   const int& index_size = src_index.dims()[0];
+  PADDLE_ENFORCE_GT(
+      index_size,
+      0,
+      errors::InvalidArgument("The first dimension of src_index or dst_index "
+                              "should be greater than 0, but received %d.",
+                              index_size));
+
   auto out_dims = out->dims();
   int64_t memset_size = 1;
   for (int i = 0; i < out_dims.size(); i++) {
@@ -65,7 +72,6 @@ void GraphSendUVOpKernelLaunchHelper(const Context& ctx,
   }
   ctx.template Alloc<T>(out);
   T* out_data = out->data<T>();
-  if (index_size == 0) return;

   const auto& bcast_info = phi::CalcBCastInfo(x.dims(), y.dims());
   const T* x_data = x.data<T>();
diff --git a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
index 4fb0335241d01..5b8d7b28dcc29 100644
--- a/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_uv_grad_kernel.cu
@@ -219,6 +219,12 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx,
                                              DenseTensor* x_grad,
                                              DenseTensor* y_grad) {
   const int64_t& index_size = dst_index.dims()[0];
+  PADDLE_ENFORCE_GT(
+      index_size,
+      0,
+      errors::InvalidArgument("The first dimension of src_index or dst_index "
+                              "should be greater than 0, but received %d.",
+                              index_size));

   ctx.template Alloc<T>(x_grad);
   T* x_grad_data = x_grad->data<T>();
@@ -246,8 +252,6 @@ void GraphSendUVGradOpCUDAKernelLaunchHelper(const Context& ctx,
   cudaMemset(y_grad_data, 0, memset_bytes_y);
 #endif

-  if (index_size == 0) return;
-
   const T* out_grad_data = out_grad.data<T>();
   const IndexT* s_index = src_index.data<IndexT>();
   const IndexT* d_index = dst_index.data<IndexT>();
diff --git a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
index 05f38a5dd3cda..f1e4581773f54 100644
--- a/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
+++ b/paddle/phi/kernels/gpu/graph_send_uv_kernel.cu
@@ -71,6 +71,13 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx,
                                          const std::string& message_op,
                                          DenseTensor* out) {
   const int64_t& index_size = src_index.dims()[0];
+  PADDLE_ENFORCE_GT(
+      index_size,
+      0,
+      errors::InvalidArgument("The first dimension of src_index or dst_index "
+                              "should be greater than 0, but received %d.",
+                              index_size));
+
   auto out_dims = out->dims();
   int64_t memset_size = 1;
   for (int i = 0; i < out_dims.size(); i++) {
@@ -78,7 +85,6 @@ void GraphSendUVOpCUDAKernelLaunchHelper(const Context& ctx,
   }
   ctx.template Alloc<T>(out);
   T* out_data = out->data<T>();
-  if (index_size == 0) return;

   const auto& bcast_info = phi::CalcBCastInfo(x.dims(), y.dims());
   const T* x_data = x.data<T>();
diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py
index dd195e45e3d14..f215e5be74a48 100644
--- a/python/paddle/geometric/message_passing/__init__.py
+++ b/python/paddle/geometric/message_passing/__init__.py
@@ -14,4 +14,10 @@

 from .send_recv import send_u_recv  # noqa: F401
 from .send_recv import send_ue_recv  # noqa: F401
-from .send import send_uv  # noqa: F401
+from .send_recv import send_uv  # noqa: F401
+
+__all__ = [
+    'send_u_recv',
+    'send_ue_recv',
+    'send_uv',
+]
diff --git a/python/paddle/geometric/message_passing/send.py b/python/paddle/geometric/message_passing/send.py
deleted file mode 100644
index 27c57ecfeb9ca..0000000000000
--- a/python/paddle/geometric/message_passing/send.py
+++ /dev/null
@@ -1,134 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import numpy as np
-from paddle.fluid.layer_helper import LayerHelper
-from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode
-from paddle.fluid.data_feeder import check_variable_and_dtype
-from paddle import _C_ops
-
-from .utils import reshape_lhs_rhs
-
-
-def send_uv(x, y, src_index, dst_index, message_op="add", name=None):
-    """
-
-    Graph Learning message passing api.
-
-    This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory
-    consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as
-    the destination node feature tensor. Then we use `src_index` and `dst_index` to gather the corresponding data,
-    and then compute the edge features in different message_ops like `add`, `sub`, `mul`, `div`.
-
-    .. code-block:: text
-
-        Given:
-
-        x = [[0, 2, 3],
-             [1, 4, 5],
-             [2, 6, 7]]
-
-        y = [[0, 1, 2],
-             [2, 3, 4],
-             [4, 5, 6]]
-
-        src_index = [0, 1, 2, 0]
-
-        dst_index = [1, 2, 1, 0]
-
-        message_op = "add"
-
-        Then:
-
-        out = [[2, 5, 7],
-               [5, 9, 11],
-               [4, 9, 11],
-               [0, 3, 5]]
-
-    Args:
-        x (Tensor): The source node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version.
-        y (Tensor): The destination node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version.
-        src_index (Tensor): An 1-D tensor, and the available data type is int32, int64.
-        dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`.
-                            The available data type is int32, int64.
-        message_op (Tensor): Different message ops for x and y, including `add`, `sub`, `mul` and `div`.
-        name (str, optional): Name for the operation (optional, default is None).
-                              For more information, please refer to :ref:`api_guide_Name`.
-
-    Returns:
-        out (Tensor): The output tensor.
-
-    Examples:
-
-        .. code-block:: python
-
-            import paddle
-
-            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
-            y = paddle.to_tensor([[0, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32")
-            indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32")
-            src_index = indexes[:, 0]
-            dst_index = indexes[:, 1]
-            out = paddle.geometric.send_uv(x, y, src_index, dst_index, message_op="add")
-            # Outputs: [[2., 5., 7.], [5., 9., 11.], [4., 9., 11.], [0., 3., 5.]]
-
-    """
-
-    if message_op not in ['add', 'sub', 'mul', 'div']:
-        raise ValueError(
-            "message_op should be `add`, `sub`, `mul`, `div`, but received %s" %
-            message_op)
-
-    x, y = reshape_lhs_rhs(x, y)
-
-    if message_op == 'sub':
-        message_op = 'add'
-        y = -y
-    if message_op == 'div':
-        message_op = 'mul'
-        y = 1. / y
-
-    if in_dygraph_mode():
-        return _C_ops.final_state_graph_send_uv(x, y, src_index, dst_index,
-                                                message_op.upper())
-    else:
-        if _in_legacy_dygraph():
-            return _C_ops.graph_send_uv(x, y, src_index, dst_index,
-                                        "message_op", message_op.upper())
-        else:
-            helper = LayerHelper("send_uv", **locals())
-            check_variable_and_dtype(
-                x, 'x', ['int32', 'int64', 'float32', 'float64', 'float16'],
-                'graph_send_uv')
-            check_variable_and_dtype(
-                y, 'y', ['int32', 'int64', 'float32', 'float64', 'float16'],
-                'graph_send_uv')
-            check_variable_and_dtype(src_index, 'src_index', ['int32', 'int64'],
-                                     'graph_send_uv')
-            check_variable_and_dtype(dst_index, 'dst_index', ['int32', 'int64'],
-                                     'graph_send_uv')
-            out = helper.create_variable_for_type_inference(dtype=x.dtype)
-
-            inputs = {
-                'x': x,
-                'y': y,
-                'src_index': src_index,
-                'dst_index': dst_index
-            }
-            attrs = {'message_op': message_op.upper()}
-            helper.append_op(type="graph_send_uv",
-                             inputs=inputs,
-                             attrs=attrs,
-                             outputs={"out": out})
-            return out
diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py
index bfe63f1f04d73..de8fd3b005e29 100644
--- a/python/paddle/geometric/message_passing/send_recv.py
+++ b/python/paddle/geometric/message_passing/send_recv.py
@@ -21,6 +21,8 @@

 from .utils import convert_out_size_to_list, get_out_size_tensor_inputs, reshape_lhs_rhs

+__all__ = []
+

 def send_u_recv(x,
                 src_index,
@@ -336,3 +338,116 @@ def send_ue_recv(x,
         },
         attrs=attrs)
     return out
+
+
+def send_uv(x, y, src_index, dst_index, message_op="add", name=None):
+    """
+
+    Graph Learning message passing API.
+
+    This API is mainly used in the Graph Learning domain, and its main purpose is to reduce
+    intermediate memory consumption in the process of message passing. Take `x` as the source
+    node feature tensor and `y` as the destination node feature tensor; `src_index` and
+    `dst_index` are then used to gather the corresponding data, and the edge features are
+    computed with the chosen `message_op`, one of `add`, `sub`, `mul`, `div`.
+
+    .. code-block:: text
+
+        Given:
+
+        x = [[0, 2, 3],
+             [1, 4, 5],
+             [2, 6, 7]]
+
+        y = [[0, 1, 2],
+             [2, 3, 4],
+             [4, 5, 6]]
+
+        src_index = [0, 1, 2, 0]
+
+        dst_index = [1, 2, 1, 0]
+
+        message_op = "add"
+
+        Then:
+
+        out = [[2, 5, 7],
+               [5, 9, 11],
+               [4, 9, 11],
+               [0, 3, 5]]
+
+    Args:
+        x (Tensor): The source node feature tensor, and the available data type is float32, float64, int32, int64.
+                    Float16 is also supported in the GPU version.
+        y (Tensor): The destination node feature tensor, and the available data type is float32, float64, int32, int64.
+                    Float16 is also supported in the GPU version.
+        src_index (Tensor): A 1-D tensor, and the available data type is int32, int64.
+        dst_index (Tensor): A 1-D tensor, and should have the same shape as `src_index`.
+                            The available data type is int32, int64.
+        message_op (str): Different message ops for x and y, including `add`, `sub`, `mul` and `div`.
+        name (str, optional): Name for the operation (optional, default is None).
+                              For more information, please refer to :ref:`api_guide_Name`.
+
+    Returns:
+        out (Tensor): The output tensor.
+
+    Examples:
+
+        .. code-block:: python
+
+            import paddle
+
+            x = paddle.to_tensor([[0, 2, 3], [1, 4, 5], [2, 6, 7]], dtype="float32")
+            y = paddle.to_tensor([[0, 1, 2], [2, 3, 4], [4, 5, 6]], dtype="float32")
+            indexes = paddle.to_tensor([[0, 1], [1, 2], [2, 1], [0, 0]], dtype="int32")
+            src_index = indexes[:, 0]
+            dst_index = indexes[:, 1]
+            out = paddle.geometric.send_uv(x, y, src_index, dst_index, message_op="add")
+            # Outputs: [[2., 5., 7.], [5., 9., 11.], [4., 9., 11.], [0., 3., 5.]]
+
+    """
+
+    if message_op not in ['add', 'sub', 'mul', 'div']:
+        raise ValueError(
+            "message_op should be `add`, `sub`, `mul`, `div`, but received %s" %
+            message_op)
+
+    x, y = reshape_lhs_rhs(x, y)
+
+    if message_op == 'sub':
+        message_op = 'add'
+        y = -y
+    if message_op == 'div':
+        message_op = 'mul'
+        y = 1. / y
+
+    if in_dygraph_mode():
+        return _C_ops.final_state_graph_send_uv(x, y, src_index, dst_index,
+                                                message_op.upper())
+    else:
+        if _in_legacy_dygraph():
+            return _C_ops.graph_send_uv(x, y, src_index, dst_index,
+                                        "message_op", message_op.upper())
+        else:
+            helper = LayerHelper("send_uv", **locals())
+            check_variable_and_dtype(
+                x, 'x', ['int32', 'int64', 'float32', 'float64', 'float16'],
+                'graph_send_uv')
+            check_variable_and_dtype(
+                y, 'y', ['int32', 'int64', 'float32', 'float64', 'float16'],
+                'graph_send_uv')
+            check_variable_and_dtype(src_index, 'src_index', ['int32', 'int64'],
+                                     'graph_send_uv')
+            check_variable_and_dtype(dst_index, 'dst_index', ['int32', 'int64'],
+                                     'graph_send_uv')
+            out = helper.create_variable_for_type_inference(dtype=x.dtype)
+
+            inputs = {
+                'x': x,
+                'y': y,
+                'src_index': src_index,
+                'dst_index': dst_index
+            }
+            attrs = {'message_op': message_op.upper()}
+            helper.append_op(type="graph_send_uv",
+                             inputs=inputs,
+                             attrs=attrs,
+                             outputs={"out": out})
+            return out
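As a quick reference for the semantics added above: for 1-D index tensors and
broadcastable feature shapes, send_uv behaves like the following NumPy sketch
(the helper name send_uv_ref is illustrative only and is not part of the patch;
inputs are assumed to be NumPy arrays).

    import numpy as np

    def send_uv_ref(x, y, src_index, dst_index, message_op="add"):
        # Gather the per-edge source and destination features, then combine
        # them elementwise; NumPy broadcasting plays the role of CalcBCastInfo.
        x, y = np.asarray(x), np.asarray(y)
        ops = {"add": np.add, "sub": np.subtract,
               "mul": np.multiply, "div": np.divide}
        return ops[message_op](x[src_index], y[dst_index])

    # Reproduces the docstring example:
    # x = [[0, 2, 3], [1, 4, 5], [2, 6, 7]], y = [[0, 1, 2], [2, 3, 4], [4, 5, 6]]
    # src_index = [0, 1, 2, 0], dst_index = [1, 2, 1, 0], message_op = "add"
    # -> [[2, 5, 7], [5, 9, 11], [4, 9, 11], [0, 3, 5]]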