From e860538122b25f71f03955269ba0b8de3fa52759 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Mon, 6 Jun 2022 10:51:25 +0800 Subject: [PATCH 01/53] set serial in cmakelist need approve (#43165) --- tools/check_file_diff_approvals.sh | 4 ++-- tools/final_ut_parallel_rule.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 6e086d9d7ca58..ee282fb294aea 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -428,13 +428,13 @@ RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|gre if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED}; do - RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|RUN_TYPE=NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY|RUN_TYPE=DIST:NIGHTLY|PROPERTIES[[:space:]]+TIMEOUT" || true` + RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "SERIAL|RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|RUN_TYPE=NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY|RUN_TYPE=DIST:NIGHTLY|PROPERTIES[[:space:]]+TIMEOUT" || true` if [[ ${RUNTYPE_ADD} != "" ]];then RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n" fi done if [[ ${RUNTYPE_ADD_LINES} != "" ]];then - echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE as EXCLUSIVE, DIST, NIGHTLY, EXCLUSIVE:NIGHTLY or DISTNIGHTLY, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification" + echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE as EXCLUSIVE, DIST, NIGHTLY, EXCLUSIVE:NIGHTLY or DISTNIGHTLY, or setting parameter SERIAL, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification" check_approval 1 32428676 45041955 fi fi diff --git a/tools/final_ut_parallel_rule.py b/tools/final_ut_parallel_rule.py index a5c9f921148ac..7a25eee71b227 100644 --- a/tools/final_ut_parallel_rule.py +++ b/tools/final_ut_parallel_rule.py @@ -99,6 +99,8 @@ def classify_cases_by_mem(rootPath): continue if case in case_always_timeout: no_parallel_case = no_parallel_case + '|^' + case + '$' + continue + if case not in new_lastest_mem: continue From 19b4ff47818c492cd40977a3d0fc044c6830790c Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 6 Jun 2022 11:05:18 +0800 Subject: [PATCH 02/53] =?UTF-8?q?=E3=80=90code=20format=20check=20upgrade?= =?UTF-8?q?=E3=80=91=20step1=EF=BC=9Acpplint=20(#43175)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test=document_fix * cpplint set version 1.6.0;test=document_fix --- tools/codestyle/cpplint_pre_commit.hook | 9 +++++++-- tools/codestyle/pylint_pre_commit.hook | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook index c90bf29ecb794..cef11ab1351b7 100755 --- a/tools/codestyle/cpplint_pre_commit.hook +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -1,10 +1,15 @@ #!/bin/bash TOTAL_ERRORS=0 + 
+readonly VERSION="1.6.0" + +version=$(cpplint --version) + if [[ ! $TRAVIS_BRANCH ]]; then # install cpplint on local machine. - if [[ ! $(which cpplint) ]]; then - pip install cpplint + if ! [[ $version == *"$VERSION"* ]]; then + pip install cpplint==1.6.0 fi # diff files on local machine. files=$(git diff --cached --name-status | awk '$1 != "D" {print $2}') diff --git a/tools/codestyle/pylint_pre_commit.hook b/tools/codestyle/pylint_pre_commit.hook index f0d2f0a22d79e..1c81f4b456339 100755 --- a/tools/codestyle/pylint_pre_commit.hook +++ b/tools/codestyle/pylint_pre_commit.hook @@ -10,7 +10,7 @@ readonly VERSION="2.12.0" version=$(pylint --version | grep 'pylint') if ! [[ $version == *"$VERSION"* ]]; then - pip install pylint==2.12.0 1>nul + pip install pylint==2.12.0 fi # The trick to remove deleted files: https://stackoverflow.com/a/2413151 From 2a17e3c1b5a67341205f5e9c4cac7cef525b0af6 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sun, 5 Jun 2022 23:26:19 -0500 Subject: [PATCH 03/53] update relu custom op demo (#43173) --- .../fluid/tests/custom_op/custom_relu_op.cc | 26 ++++++------ .../fluid/tests/custom_op/custom_relu_op.cu | 41 +++++++++---------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index 04399a9826cfe..f1860635ed5f4 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" -#define CHECK_CPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template void relu_cpu_forward_kernel(const data_t* x_data, @@ -26,7 +25,7 @@ void relu_cpu_forward_kernel(const data_t* x_data, int64_t x_numel) { PD_CHECK(x_data != nullptr, "x_data is nullptr."); PD_CHECK(out_data != nullptr, "out_data is nullptr."); - for (int i = 0; i < x_numel; ++i) { + for (int64_t i = 0; i < x_numel; ++i) { out_data[i] = std::max(static_cast(0.), x_data[i]); } } @@ -36,7 +35,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data, const data_t* out_data, data_t* grad_x_data, int64_t out_numel) { - for (int i = 0; i < out_numel; ++i) { + for (int64_t i = 0; i < out_numel; ++i) { grad_x_data[i] = grad_out_data[i] * (out_data[i] > static_cast(0) ? 1. 
: 0.); } @@ -54,12 +53,12 @@ void relu_cpu_double_backward_kernel(const data_t* out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::empty(x.shape(), x.dtype(), x.place()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( - x.data(), out.mutable_data(x.place()), x.size()); + x.data(), out.data(), x.numel()); })); return {out}; @@ -68,13 +67,13 @@ std::vector relu_cpu_forward(const paddle::Tensor& x) { std::vector relu_cpu_backward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place()); + auto grad_x = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( grad_out.data(), out.data(), - grad_x.mutable_data(x.place()), + grad_x.data(), out.size()); })); @@ -108,9 +107,9 @@ std::vector relu_cuda_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx); std::vector ReluForward(const paddle::Tensor& x) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_forward(x); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_forward(x); } else { PD_THROW("Not implemented."); @@ -120,10 +119,9 @@ std::vector ReluForward(const paddle::Tensor& x) { std::vector ReluBackward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out) { - // TODO(chenweihang): Check Input - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_backward(x, out, grad_out); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_backward(x, out, grad_out); } else { PD_THROW("Not implemented."); @@ -214,7 +212,7 @@ void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( - x.data(), out->mutable_data(x.place()), x.size()); + x.data(), out->mutable_data(x.place()), x.numel()); })); } diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 18f1a2b95c2ee..f9314ea4b1066 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -14,15 +14,14 @@ #include "paddle/extension.h" -#define CHECK_GPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") +#define CHECK_GPU_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") template __global__ void relu_cuda_forward_kernel(const data_t* x, data_t* y, - const int num) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { + int64_t num) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { y[i] = x[i] > static_cast(0.) ? x[i] : static_cast(0.); } } @@ -31,9 +30,9 @@ template __global__ void relu_cuda_backward_kernel(const data_t* dy, const data_t* y, data_t* dx, - const int num) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { + int64_t num) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { dx[i] = dy[i] * (y[i] > static_cast(0.) ? static_cast(1.) 
: static_cast(0.)); } @@ -54,15 +53,15 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data, std::vector relu_cuda_forward(const paddle::Tensor& x) { CHECK_GPU_INPUT(x); - auto out = paddle::empty(x.shape(), x.dtype(), x.place()); + auto out = paddle::empty_like(x); - int numel = x.size(); - int block = 512; - int grid = (numel + block - 1) / block; + int64_t numel = x.numel(); + int64_t block = 512; + int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( - x.data(), out.mutable_data(x.place()), numel); + x.data(), out.data(), numel); })); return {out}; @@ -74,11 +73,11 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, CHECK_GPU_INPUT(x); CHECK_GPU_INPUT(out); CHECK_GPU_INPUT(grad_out); - auto grad_x = paddle::empty(x.shape(), x.dtype(), x.place()); + auto grad_x = paddle::empty_like(x); - int numel = out.size(); - int block = 512; - int grid = (numel + block - 1) / block; + int64_t numel = out.numel(); + int64_t block = 512; + int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( @@ -97,7 +96,7 @@ std::vector relu_cuda_double_backward( CHECK_GPU_INPUT(ddx); auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); - int64_t numel = out.size(); + int64_t numel = out.numel(); int64_t block = 512; int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( @@ -119,7 +118,7 @@ std::vector relu_cuda_backward_without_x( const paddle::Tensor& out, const paddle::Tensor& grad_out) { auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); - int numel = out.size(); + int numel = out.numel(); int block = 512; int grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( @@ -135,7 +134,7 @@ std::vector relu_cuda_backward_without_x( } void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { - int numel = x.size(); + int numel = x.numel(); int block = 512; int grid = (numel + block - 1) / block; out->reshape(x.shape()); @@ -150,7 +149,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { - int numel = out.size(); + int numel = out.numel(); int block = 512; int grid = (numel + block - 1) / block; grad_x->reshape(x.shape()); From 39903f72b5b3904126b07e818b0f6bebfb2c8c4c Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Mon, 6 Jun 2022 16:23:34 +0800 Subject: [PATCH 04/53] Replace ReduceAmax/Amax.part.cu with KP (#43202) --- .../reduce_ops/reduce_amax_op.part.cu | 19 ++-- .../reduce_ops/reduce_amin_op.part.cu | 19 ++-- paddle/fluid/operators/reduce_ops/reduce_op.h | 96 ++++++++++++++++++- paddle/phi/kernels/funcs/broadcast_function.h | 19 +++- .../phi/kernels/gpu/frobenius_norm_kernel.cu | 22 ++++- 5 files changed, 150 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu index 18c846bc2b469..ed6df1e558bed 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.part.cu @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_amax_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceMaxGradKernel = + ops::ReduceCudaAMaxAMinGradKernel; +REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel, + CUDAReduceMaxGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu index c7a26049634ce..69854da3c4f25 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.part.cu @@ -12,15 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -REGISTER_OP_CUDA_KERNEL( - reduce_amin_grad, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel, - ops::ReduceGradKernel); +template +using CUDAReduceMinGradKernel = + ops::ReduceCudaAMaxAMinGradKernel; +REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel, + CUDAReduceMinGradKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 322ef1fdff67a..ff7429f75ebe3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -24,7 +24,6 @@ limitations under the License. */ #include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" #include "paddle/phi/kernels/funcs/math_function.h" - // only can include the headers in paddle/phi/api dirs #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -655,6 +654,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { bool reduce_all = context.Attr("reduce_all"); std::vector dims = context.Attr>("dim"); auto* in_x = context.Input("X"); + auto* d_out = context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); @@ -685,12 +685,106 @@ class ReduceCudaGradKernel : public framework::OpKernel { if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } + using MPType = typename kps::details::MPTypeTrait::Type; phi::ReduceGrad>( dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype, TransformOp(reduce_num)); } }; + +template +struct EqualFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { + return static_cast(a == b); + } +}; + +template +struct DivideFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; } +}; + +template class TransformOp> +class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool reduce_all = context.Attr("reduce_all"); + std::vector dims = context.Attr>("dim"); + auto* in_x = context.Input("X"); + auto* out_y = context.Input("Out"); + auto* d_out = + context.Input(framework::GradVarName("Out")); + auto* d_x = context.Output(framework::GradVarName("X")); + auto out_dtype = context.Attr("in_dtype"); + auto 
pt_out_dtype = framework::TransToPhiDataType( + static_cast(out_dtype)); + // get reduce_dim and reduce_num for reduce_mean_grad + int dim_size = in_x->dims().size(); + std::vector reduce_dims = GetReduceDim(dims, dim_size, reduce_all); + auto update_dims = vectorize(d_x->dims()); + int reduce_num = 1; + for (auto i : reduce_dims) { + reduce_num *= (in_x->dims())[i]; + update_dims[i] = 1; + } + auto& dev_ctx = context.cuda_device_context(); + + // make new tensor reduce_out + phi::DenseTensor new_y(out_y->type()); + new_y.ShareDataWith(*out_y); + new_y.Resize(phi::make_ddim(update_dims)); + + // make new tensor d_out + phi::DenseTensor new_dout(d_out->type()); + new_dout.ShareDataWith(*d_out); + new_dout.Resize(phi::make_ddim(update_dims)); + d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); + + auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x); + auto new_in_tensor = new_in.get(); + + auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x); + auto new_dx_tensor = new_dx.get(); + + // make equal_out + phi::DenseTensor* equal_out = new phi::DenseTensor(); + equal_out->Resize(in_x->dims()); + dev_ctx.template Alloc(equal_out); + auto equal_out_tensor = *equal_out; + + // make new tensor equal_count + phi::DenseTensor* equal_count = new phi::DenseTensor(); + equal_count->Resize(phi::make_ddim(update_dims)); + dev_ctx.template Alloc(equal_count); + + // compute + // 1. equal_out = Equal(x, y) + std::vector equal_inputs = {&new_y, new_in_tensor}; + std::vector equal_outputs = {&equal_out_tensor}; + phi::funcs::BroadcastKernel( + dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor()); + // 2. equal_count = reduceSum(equal_out) + using MPType = typename kps::details::MPTypeTrait::Type; + phi::funcs::ReduceKernel>( + dev_ctx, equal_out_tensor, equal_count, + kps::IdentityFunctor(), reduce_dims, false); + + // 3. 
dx = Div(dout, equal_out) + std::vector grad_inputs = {&equal_out_tensor, + equal_count}; + std::vector grad_outputs = {new_dx_tensor}; + phi::funcs::BroadcastKernel( + dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor()); + delete equal_out; + delete equal_count; + } +}; #endif #endif diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 88b87c07c7615..74e48f3918548 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -605,7 +605,22 @@ void ElementwiseCompute(const GPUContext &dev_ctx, dev_ctx, ins, &outs, axis, func); } -#endif +template +void DefaultElementwiseOperator(const DeviceContext &dev_ctx, + const DenseTensor &x, + const DenseTensor &y, + DenseTensor *z, + int axis = -1) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + dev_ctx.template Alloc(z); + funcs::ElementwiseCompute(dev_ctx, x, y, axis, Functor(), z); +} + +#else template +void FrobeniusNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + phi::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); + std::vector ins = {out}; + std::vector outs = {out}; + auto functor = funcs::CudaSqrtFunctor(); + funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +} // namespace phi PD_REGISTER_KERNEL( frobenius_norm, GPU, ALL_LAYOUT, phi::FrobeniusNormKernel, float, double) {} From 398b96c6b887298e1e721b4e62d52480b37d6f63 Mon Sep 17 00:00:00 2001 From: SmirnovKol <31559413+SmirnovKol@users.noreply.github.com> Date: Mon, 6 Jun 2022 18:31:52 +0800 Subject: [PATCH 05/53] Update optimizer.py (#43201) --- python/paddle/optimizer/optimizer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index e3e7257f75705..ec367c7c710ed 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -338,9 +338,6 @@ def set_state_dict(self, state_dict): adam.set_state_dict(opti_state_dict) ''' - if isinstance(self._learning_rate, LRScheduler): - self._learning_rate.set_dict(state_dict["LR_Scheduler"]) - if isinstance(self._learning_rate, LRScheduler): self._learning_rate.set_state_dict(state_dict["LR_Scheduler"]) From c22e1123091d7b6592b07a9f6acb1c8c108e271b Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Mon, 6 Jun 2022 19:06:42 +0800 Subject: [PATCH 06/53] [AutoParallel] fix gradient merge optimize parse (#43169) * fix gradient merge * bug fix * update annotation --- .../auto_parallel/parallelizer_v2.py | 6 +- .../passes/auto_parallel_gradient_merge.py | 70 ++++++++------- .../distributed_passes/CMakeLists.txt | 2 +- ...test_auto_parallel_gradient_merge_pass.py} | 88 +++++-------------- 4 files changed, 66 insertions(+), 100 deletions(-) rename python/paddle/fluid/tests/unittests/distributed_passes/{test_dist_gradient_merge_pass.py => test_auto_parallel_gradient_merge_pass.py} (72%) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index ce543988ea4e1..f02eb38f45877 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -148,7 +148,7 @@ def _apply_pre_optimization(self, main_program, startup_program, loss, config) 
auto_parallel_recompute_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) def _apply_post_optimization(self, main_program, startup_program, rank, params_grads): @@ -162,7 +162,7 @@ def _apply_post_optimization(self, main_program, startup_program, rank, auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", config) auto_parallel_sharding_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) if self._strategy.gradient_merge: config = copy.deepcopy(self._strategy.gradient_merge_configs) @@ -172,4 +172,4 @@ def _apply_post_optimization(self, main_program, startup_program, rank, "auto_parallel_gradient_merge_pass", config) auto_parallel_gradient_merge_pass.apply([main_program], [startup_program], - self._dist_context) + self._pass_context) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index bc40dad8ac0d9..394d71706c4c4 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -18,10 +18,10 @@ import paddle from paddle.framework import core +from paddle.fluid import layers from paddle.fluid.framework import program_guard, device_guard -from paddle.fluid import unique_name, layers -from paddle.fluid.clip import append_gradient_clip_ops from .pass_base import PassBase, PassType, register_pass +from paddle.distributed.fleet.meta_optimizers.common import OpRole from paddle.distributed.auto_parallel.utils import set_var_dist_attr from paddle.distributed.auto_parallel.utils import naive_set_dist_op_attr_for_program_by_mesh_and_mapping from paddle.distributed.auto_parallel.process_group import get_world_process_group @@ -29,16 +29,8 @@ world_process_group = get_world_process_group() -def _is_the_backward_op(op): - OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - OpRole = core.op_proto_and_checker_maker.OpRole - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) - - def _is_the_optimizer_op(op): OP_ROLE_KEY = core.op_proto_and_checker_maker.kOpRoleAttrName() - OpRole = core.op_proto_and_checker_maker.OpRole return OP_ROLE_KEY in op.attr_names and \ int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) @@ -47,13 +39,13 @@ def _remove_and_get_optimizer_op(main_program, dist_context): # 1 create tmp block # 2 mv optimizer op from global program to tmp block # 3 del the op from dist_context - from paddle.distributed.fleet.meta_optimizers.common import OpRole main_block = main_program.global_block() temp_block = main_program._create_block() removed_op_idx = [] optimize_ops_desc = [] + skip_ops = ["increment", "elementwise_mod", "equal"] for idx, op in enumerate(main_block.ops): - if _is_the_optimizer_op(op): + if _is_the_optimizer_op(op) and op.type not in skip_ops: # append optimizer op to tmp block new_op_desc = temp_block.desc.append_op() new_op_desc.copy_from(op.desc) @@ -111,8 +103,17 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): set_var_dist_attr(dist_context, cond_var, [-1], world_process_group.ranks) with device_guard("cpu"): - # step_var = (step_var + 1) % k_step - layers.increment(x=step_var, value=1.0, in_place=True) + # step_var += 1 + increment_op = main_block.append_op(type='increment', + inputs={'X': [step_var]}, + outputs={'Out': [step_var]}, + attrs={ + 'step': float(1.0), + 'op_role': OpRole.Optimize + }) + 
naive_set_dist_op_attr_for_program_by_mesh_and_mapping( + increment_op, world_process_group.ranks, [-1], dist_context) + # step_var %= k_step elementwise_mod_op = main_block.append_op(type='elementwise_mod', inputs={ 'X': step_var, @@ -121,18 +122,19 @@ def _get_gm_cond_var(main_program, k_steps, dist_context): outputs={'Out': step_var}, attrs={ 'axis': -1, - 'use_mkldnn': False + 'use_mkldnn': False, + 'op_role': OpRole.Optimize }) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( elementwise_mod_op, world_process_group.ranks, [-1], dist_context) - # cond_var = (step_var == 0) equal_op = main_block.append_op(type='equal', inputs={ 'X': step_var, 'Y': zero_var }, - outputs={'Out': cond_var}) + outputs={'Out': cond_var}, + attrs={'op_role': OpRole.Optimize}) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( equal_op, world_process_group.ranks, [-1], dist_context) @@ -154,7 +156,9 @@ def _append_gradient_merge_backward_op( _remove_op_role_var(param, grad) - param_to_gradient_merge = {} + # {grad.name: gradient_merge_var.name} to rename opt inputs + grad_to_gradient_merge = {} + # {param: gradient_merge_var} to insert scale op and fill_constant op new_params_to_grads = [] # step2: create gradient_merge var and init with 0 for param, grad in params_grads: @@ -168,7 +172,6 @@ def _append_gradient_merge_backward_op( shape=param_var.shape, dtype=param_var.dtype, persistable=True) - param_to_gradient_merge[param_name] = gradient_merge_var ref_process_mesh = ref_dist_attr.process_mesh ref_dims_mapping = ref_dist_attr.dims_mapping @@ -197,17 +200,19 @@ def _append_gradient_merge_backward_op( outputs={'Out': gradient_merge_var}, attrs={ 'axis': -1, - 'use_mkldnn': False + 'use_mkldnn': False, + 'op_role': OpRole.Optimize }) new_params_to_grads.append([param, gradient_merge_var]) + grad_to_gradient_merge[grad.name] = gradient_merge_var.name naive_set_dist_op_attr_for_program_by_mesh_and_mapping( new_grad_op, ref_process_mesh, ref_dims_mapping, dist_context) - return new_params_to_grads, param_to_gradient_merge + return new_params_to_grads, grad_to_gradient_merge def _create_cond_block_and_update_optimizer( main_program, cond_var, new_params_to_grads: List[Tuple[Any, Any]], - param_to_gradient_merge: Dict[str, Any], optimize_ops_desc: List[Any], + grad_to_gradient_merge: Dict[str, str], optimize_ops_desc: List[Any], k_steps, avg): def true_apply_gradient(): @@ -229,7 +234,7 @@ def true_apply_gradient(): 'bias_after_scale': False }) new_grad.op._set_attr(op_maker.kOpRoleAttrName(), - op_maker.OpRole.Optimize) + OpRole.Optimize) # append optimizer ops for op_desc in optimize_ops_desc: @@ -238,14 +243,14 @@ def true_apply_gradient(): #update input/output for input_name in new_op_desc.input_arg_names(): - if input_name in new_params_to_grads: - new_op_desc._rename_input(input_name, - new_params_to_grads[input_name]) + if input_name in grad_to_gradient_merge: + new_op_desc._rename_input( + input_name, grad_to_gradient_merge[input_name]) for output_name in new_op_desc.output_arg_names(): - if output_name in new_params_to_grads: - new_op_desc._rename_output(output_name, - new_params_to_grads[output_name]) + if output_name in grad_to_gradient_merge: + new_op_desc._rename_output( + output_name, grad_to_gradient_merge[output_name]) # remove op_role_var if new_op_desc.has_attr(op_maker.kOpRoleVarAttrName()): @@ -271,6 +276,8 @@ def true_apply_gradient(): op_maker.OpRole.Optimize) layers.cond(cond_var, true_fn=true_apply_gradient, false_fn=None) + cond_op = main_program.global_block().ops[-1] + 
cond_op._set_attr('op_role', OpRole.Optimize) def parse_program(main_program, startup_program, params_grads, k_steps, avg, @@ -285,14 +292,14 @@ def parse_program(main_program, startup_program, params_grads, k_steps, avg, main_program._rollback() # 3 append gradient merge backward op to main_program - new_params_to_grads, param_to_gradient_merge = _append_gradient_merge_backward_op( + new_params_to_grads, grad_to_gradient_merge = _append_gradient_merge_backward_op( main_program, startup_program, params_grads, cond_var.name, dist_context) # 4 create ConditionalBlock and append gradient merge optimizer ops _create_cond_block_and_update_optimizer(main_program, cond_var, new_params_to_grads, - param_to_gradient_merge, + grad_to_gradient_merge, optimize_ops_desc, k_steps, avg) @@ -303,7 +310,6 @@ def __init__(self): super(GradientMergePass, self).__init__() self.set_attr("k_steps", -1) self.set_attr("avg", True) - self.set_attr("inner_optimizer", None) def _check_self(self): if self.get_attr("k_steps") < 1: diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index c68cebaa25b22..29e528edce914 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -14,12 +14,12 @@ if((NOT WITH_GPU) list(REMOVE_ITEM TEST_OPS "test_dist_fuse_momentum_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") - list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_fp16_pass") + list(REMOVE_ITEM TEST_OPS "test_auto_parallel_gradient_merge_pass") endif() foreach(TEST_OP ${TEST_OPS}) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py similarity index 72% rename from python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py rename to python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index f856059402efb..50e1871820186 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_gradient_merge_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -25,20 +25,14 @@ import paddle.utils as utils import paddle.static as static import paddle.nn.functional as F +import paddle.distributed.fleet as fleet import paddle.distributed.auto_parallel as auto -from paddle.fluid.initializer import NumpyArrayInitializer -from paddle.distributed.passes import new_pass, PassManager, PassContext -import paddle.distributed.fleet as fleet -from dist_pass_test_base import DistPassTestBase -from paddle.distributed.auto_parallel.dist_context import DistributedContext +from paddle.fluid.initializer import NumpyArrayInitializer +from auto_parallel_pass_test_base import AutoPallelPassTestBase logging.getLogger().setLevel(logging.INFO) paddle.enable_static() -_global_parallel_strategy = None -_global_process_mesh = None - -#np.set_printoptions(suppress=True) class MLPLayer(nn.Layer): @@ -103,13 
+97,11 @@ def forward(self, input): def mlp_forward(input, label, hidden_size): - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - + auto.shard_tensor(input, + dist_attr={ + "process_mesh": [0], + "dims_mapping": [-1, -1] + }) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -119,40 +111,33 @@ def mlp_forward(input, label, hidden_size): return loss -class TestGradientMergePass(DistPassTestBase): +class TestGradientMergePass(AutoPallelPassTestBase): def init(self): - self._params_grads = None - self._config = {"k_steps": 4, "avg": True} - #self._config["dist_context"] = DistributedContext() - - def apply_passes(self, main_prog, startup_prog): - #self._config["params_grads"] = self._params_grads - #pass_context = PassContext() - #auto_parallel_gradient_merge_pass = new_pass( - # "auto_parallel_gradient_merge_pass", self._config) - #auto_parallel_gradient_merge_pass.apply([main_prog], [startup_prog], - # pass_context) + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + + def apply_passes(self): dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True dist_strategy.gradient_merge = True dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) def test_result(self): no_pass_rets = self._distributed_launch(model=None, apply_pass=False, gpus=[0], - gradient_merge=False, batch_size=32, + hidden_size=128, max_step=2) pass_rets = self._distributed_launch(model=None, apply_pass=True, gpus=[0], - gradient_merge=True, batch_size=8, + hidden_size=128, max_step=8) - """ # avg loss for gradient_merge pass avg_loss = 0 pass_avg_ret_list = [] @@ -167,40 +152,16 @@ def test_result(self): for no_pass_ret, pass_ret in zip(no_pass_rets[0], pass_avg_ret_list): print(f"no_pass_ret={no_pass_ret}, pass_ret={pass_ret}") self.assertTrue( - np.isclose( - no_pass_ret, - pass_ret, - rtol=self.rtol, - atol=self.atol, - equal_nan=self.equal_nan)) - """ - - def get_model(self, place, gradient_merge, batch_size, max_step): - paddle.seed(2021) - random.seed(2021) - np.random.seed(2021) + np.isclose(no_pass_ret, + pass_ret, + rtol=self.rtol, + atol=self.atol, + equal_nan=self.equal_nan)) - hidden_size = 128 - - global _global_parallel_strategy - global _global_process_mesh - world_size = paddle.distributed.get_world_size() - if world_size == 1: - _global_parallel_strategy = "dp" - _global_process_mesh = auto.ProcessMesh([0]) - elif world_size == 2: - _global_parallel_strategy = "dp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + def get_model(self, place, batch_size, hidden_size, max_step): train_program = static.Program() startup_program = static.Program() - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - #if gradient_merge: - # dist_strategy.gradient_merge = True - # dist_strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - fleet.init(is_collective=True, strategy=dist_strategy) - with static.program_guard(train_program, startup_program), \ utils.unique_name.guard(): input = static.data(name="input", @@ -212,8 +173,7 @@ def get_model(self, place, gradient_merge, batch_size, max_step): input.stop_gradient = False loss = mlp_forward(input, label, hidden_size) - optimizer = paddle.fluid.optimizer.SGDOptimizer(learning_rate=0.01) - #optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + 
optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer) _, self._params_grads, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program) From 607a1d65de8f9c01a7e17e95160928030c45553b Mon Sep 17 00:00:00 2001 From: heliqi <1101791222@qq.com> Date: Mon, 6 Jun 2022 06:59:27 -0500 Subject: [PATCH 07/53] [inference]Resolve protobuf of ORT Backend conflict (#43159) * modify paddle2onnx cmake * modify paddle2onnx cmake * modify export interface * modify paddle2onnx export * paddle2onnx add mac windows * modify paddle2onnx mac windows cmake * modify paddle2onnx mac windows cmake * modify paddle2onnx cmake support windows * modify paddle2onnx cmake support windows * modify paddle2onnx cmake support windows Co-authored-by: xiegegege --- cmake/external/paddle2onnx.cmake | 102 +++++++++--------- cmake/external/protobuf.cmake | 9 +- cmake/inference_lib.cmake | 16 +-- .../eager/auto_code_generator/CMakeLists.txt | 2 +- .../inference/api/onnxruntime_predictor.cc | 27 +++-- paddle/fluid/pybind/CMakeLists.txt | 2 +- python/setup.py.in | 10 +- 7 files changed, 84 insertions(+), 84 deletions(-) diff --git a/cmake/external/paddle2onnx.cmake b/cmake/external/paddle2onnx.cmake index 8252b2a73e943..75e2c42cb5a29 100644 --- a/cmake/external/paddle2onnx.cmake +++ b/cmake/external/paddle2onnx.cmake @@ -25,82 +25,82 @@ include(ExternalProject) set(PADDLE2ONNX_PROJECT "extern_paddle2onnx") set(PADDLE2ONNX_PREFIX_DIR ${THIRD_PARTY_PATH}/paddle2onnx) +set(PADDLE2ONNX_SOURCE_DIR + ${THIRD_PARTY_PATH}/paddle2onnx/src/${PADDLE2ONNX_PROJECT}) set(PADDLE2ONNX_INSTALL_DIR ${THIRD_PARTY_PATH}/install/paddle2onnx) set(PADDLE2ONNX_INC_DIR "${PADDLE2ONNX_INSTALL_DIR}/include" CACHE PATH "paddle2onnx include directory." FORCE) -set(PADDLE2ONNX_REPOSITORY ${GIT_URL}/PaddlePaddle/Paddle2ONNX.git) -set(PADDLE2ONNX_TAG cpp) -set(LIBDIR "lib") +set(PADDLE2ONNX_LIB_DIR + "${PADDLE2ONNX_INSTALL_DIR}/lib" + CACHE PATH "onnxruntime lib directory." FORCE) set(CMAKE_BUILD_RPATH "${CMAKE_BUILD_RPATH}" "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}") include_directories(${PADDLE2ONNX_INC_DIR} )# For PADDLE2ONNX code to include internal headers. if(WIN32) + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "Paddle2ONNX source library." FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.lib" - CACHE FILEPATH "paddle2onnx static library." FORCE) - set(PADDLE2ONNX_SHARED_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/paddle2onnx.dll" - CACHE FILEPATH "paddle2onnx shared library." FORCE) + "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.dll" + CACHE FILEPATH "paddle2onnx library." FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/paddle2onnx.lib" + CACHE FILEPATH "paddle2onnx compile library." FORCE) elseif(APPLE) + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "Paddle2ONNX source library." FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.dylib" + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" CACHE FILEPATH "PADDLE2ONNX library." FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.dylib" + CACHE FILEPATH "paddle2onnx compile library." FORCE) else() + set(PADDLE2ONNX_SOURCE_LIB + "${PADDLE2ONNX_SOURCE_DIR}/lib/libpaddle2onnx.so" + CACHE FILEPATH "Paddle2ONNX source library." 
FORCE) set(PADDLE2ONNX_LIB - "${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR}/libpaddle2onnx.so" + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" CACHE FILEPATH "PADDLE2ONNX library." FORCE) + set(PADDLE2ONNX_COMPILE_LIB + "${PADDLE2ONNX_INSTALL_DIR}/lib/libpaddle2onnx.so" + CACHE FILEPATH "paddle2onnx compile library." FORCE) endif(WIN32) -# The protoc path is required to compile onnx. -string(REPLACE "/" ";" PROTOC_BIN_PATH ${PROTOBUF_PROTOC_EXECUTABLE}) -list(POP_BACK PROTOC_BIN_PATH) -list(JOIN PROTOC_BIN_PATH "/" PROTOC_BIN_PATH) - -set(PADDLE2ONNX_OPTIONAL_ARGS - -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_CXX_STANDARD=14 - -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} - -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} - -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} - -DONNX_CUSTOM_PROTOC_PATH=${PROTOC_BIN_PATH} - -DWITH_STATIC=OFF - -DMSVC_STATIC_CRT=${MSVC_STATIC_CRT} - -DCMAKE_INSTALL_PREFIX=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${PADDLE2ONNX_INSTALL_DIR}/${LIBDIR} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - ${EXTERNAL_OPTIONAL_ARGS}) - -if(WITH_PYTHON) - set(PADDLE2ONNX_OPTIONAL_ARGS - ${PADDLE2ONNX_OPTIONAL_ARGS} - -DPYTHON_EXECUTABLE:FILEPATH=${PYTHON_EXECUTABLE} - -DPYTHON_INCLUDE_DIR:PATH=${PYTHON_INCLUDE_DIR} - -DPYTHON_LIBRARY:FILEPATH=${PYTHON_LIBRARY}) +if(WIN32) + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-win-x64-0.9.7.zip" + ) +elseif(APPLE) + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-osx-x86_64-0.9.7.tgz" + ) +else() + set(PADDLE2ONNX_URL + "https://github.com/PaddlePaddle/Paddle2ONNX/releases/download/v0.9.7/paddle2onnx-linux-x64-0.9.7.tgz" + ) endif() ExternalProject_Add( ${PADDLE2ONNX_PROJECT} - ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY ${PADDLE2ONNX_REPOSITORY} - GIT_TAG ${PADDLE2ONNX_TAG} - DEPENDS protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + URL ${PADDLE2ONNX_URL} PREFIX ${PADDLE2ONNX_PREFIX_DIR} + DOWNLOAD_NO_PROGRESS 1 + CONFIGURE_COMMAND "" + BUILD_COMMAND "" UPDATE_COMMAND "" - CMAKE_ARGS ${PADDLE2ONNX_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${PADDLE2ONNX_INSTALL_DIR} - -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - BUILD_BYPRODUCTS ${PADDLE2ONNX_LIB}) + INSTALL_COMMAND + ${CMAKE_COMMAND} -E copy_directory ${PADDLE2ONNX_SOURCE_DIR}/lib + ${PADDLE2ONNX_LIB_DIR} && ${CMAKE_COMMAND} -E copy_directory + ${PADDLE2ONNX_SOURCE_DIR}/include ${PADDLE2ONNX_INC_DIR} + BUILD_BYPRODUCTS ${PADDLE2ONNX_COMPILE_LIB}) add_library(paddle2onnx STATIC IMPORTED GLOBAL) -set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION ${PADDLE2ONNX_LIB}) +set_property(TARGET paddle2onnx PROPERTY IMPORTED_LOCATION + ${PADDLE2ONNX_COMPILE_LIB}) add_dependencies(paddle2onnx ${PADDLE2ONNX_PROJECT}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 1368081b58fda..7c5de92362db4 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -234,10 +234,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") endif() - if(WITH_ONNXRUNTIME) - set(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - set(PROTOBUF_TAG v3.18.0) - 
elseif(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) set(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) set(PROTOBUF_TAG v3.8.0) elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) @@ -319,9 +316,7 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) endif() endfunction() -if(WITH_ONNXRUNTIME) - set(PROTOBUF_VERSION 3.18.0) -elseif(WITH_ASCEND OR WITH_ASCEND_CL) +if(WITH_ASCEND OR WITH_ASCEND_CL) set(PROTOBUF_VERSION 3.8.0) elseif(WITH_IPU) set(PROTOBUF_VERSION 3.6.1) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index bf69ddc8fb49a..14ae8efb5b4f8 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -148,18 +148,10 @@ function(copy_part_of_thrid_party TARGET DST) DSTS ${dst_dir} ${dst_dir}) set(dst_dir "${DST}/third_party/install/paddle2onnx") - if(WIN32) - copy( - ${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_SHARED_LIB} - ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib ${dst_dir}/lib) - else() - copy( - ${TARGET} - SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB} - DSTS ${dst_dir}/include ${dst_dir}/lib) - endif() + copy( + ${TARGET} + SRCS ${PADDLE2ONNX_INC_DIR}/paddle2onnx ${PADDLE2ONNX_LIB_DIR} + DSTS ${dst_dir}/include ${dst_dir}) endif() set(dst_dir "${DST}/third_party/install/gflags") diff --git a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt index 8c067074d6efd..aff7f057f4601 100644 --- a/paddle/fluid/eager/auto_code_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/CMakeLists.txt @@ -103,7 +103,7 @@ if(WIN32) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/onnxruntime.dll) add_custom_command( OUTPUT ${eager_generator_path}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${eager_generator_path} DEPENDS paddle2onnx) list(APPEND EAGER_CODEGEN_DEPS ${eager_generator_path}/paddle2onnx.dll) diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index e42e395ce90f8..93a96863053e5 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -74,8 +74,14 @@ bool CheckConvertToONNX(const AnalysisConfig &config) { config.model_dir(), config.prog_file(), config.params_file()); return false; } - return paddle2onnx::IsExportable(config.prog_file(), config.params_file(), - config.model_from_memory()); + if (config.model_from_memory()) { + return paddle2onnx::IsExportable( + config.prog_file().data(), config.prog_file().size(), + config.params_file().data(), config.params_file().size()); + } else { + return paddle2onnx::IsExportable(config.prog_file().c_str(), + config.params_file().c_str()); + } } bool ONNXRuntimePredictor::Init() { @@ -89,9 +95,16 @@ bool ONNXRuntimePredictor::Init() { place_ = paddle::platform::CPUPlace(); } - std::string onnx_proto; - paddle2onnx::Export(config_.prog_file(), config_.params_file(), &onnx_proto, - config_.model_from_memory()); + char *onnx_proto = nullptr; + int out_size; + if (config_.model_from_memory()) { + paddle2onnx::Export(config_.prog_file().data(), config_.prog_file().size(), + config_.params_file().data(), + config_.params_file().size(), &onnx_proto, &out_size); + } else { + paddle2onnx::Export(config_.prog_file().c_str(), + config_.params_file().c_str(), &onnx_proto, &out_size); + } Ort::SessionOptions session_options; 
if (config_.ort_optimization_enabled()) { @@ -118,7 +131,7 @@ bool ONNXRuntimePredictor::Init() { "will be " "generated."; } - session_ = {env_, onnx_proto.data(), onnx_proto.size(), session_options}; + session_ = {env_, onnx_proto, static_cast(out_size), session_options}; binding_ = std::make_shared(session_); Ort::MemoryInfo memory_info(device_name, OrtDeviceAllocator, @@ -153,6 +166,8 @@ bool ONNXRuntimePredictor::Init() { allocator.Free(output_name); } + delete onnx_proto; + onnx_proto = nullptr; return true; } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index bf74d1184322c..a99dded4d5af1 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -335,7 +335,7 @@ if(WITH_PYTHON) if(WITH_ONNXRUNTIME) add_custom_command( OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll - COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${PADDLE2ONNX_LIB} ${CMAKE_CURRENT_BINARY_DIR} DEPENDS paddle2onnx) list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/paddle2onnx.dll) diff --git a/python/setup.py.in b/python/setup.py.in index ca1768c9462f0..bb6416038f198 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -531,15 +531,13 @@ if '${WITH_MKLDNN}' == 'ON': if '${WITH_ONNXRUNTIME}' == 'ON': shutil.copy('${ONNXRUNTIME_SHARED_LIB}', libs_path) + shutil.copy('${PADDLE2ONNX_LIB}', libs_path) if os.name == 'nt': - shutil.copy('${PADDLE2ONNX_SHARED_LIB}', libs_path) package_data['paddle.libs']+=['paddle2onnx.dll', 'onnxruntime.dll'] + elif sys.platform == 'darwin': + package_data['paddle.libs']+=['libpaddle2onnx.dylib', 'libonnxruntime.1.10.0.dylib'] else: - shutil.copy('${PADDLE2ONNX_LIB}', libs_path) - if sys.platform == 'darwin': - package_data['paddle.libs']+=['libpaddle2onnx.dylib', 'libonnxruntime.1.10.0.dylib'] - else: - package_data['paddle.libs']+=['libpaddle2onnx.so', 'libonnxruntime.so.1.10.0'] + package_data['paddle.libs']+=['libpaddle2onnx.so', 'libonnxruntime.so.1.10.0'] if '${WITH_XPU}' == 'ON': # only change rpath in Release mode, From 3af98de560659956c98882e37d039b1349b4d0c2 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Mon, 6 Jun 2022 20:03:56 +0800 Subject: [PATCH 08/53] format CMakeLists.txt;add cmakelint hook and its config file (#43222) --- .pre-commit-config.yaml | 116 +++++- CMakeLists.txt | 733 ++++++++++++++++++++--------------- tools/codestyle/.cmakelintrc | 1 + 3 files changed, 536 insertions(+), 314 deletions(-) create mode 100644 tools/codestyle/.cmakelintrc diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4b588cbeb91dc..bf9aa6e915a46 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -5,7 +5,7 @@ repos: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - repo: https://github.com/google/yapf - sha: v0.32.0 + rev: v0.32.0 hooks: - id: yapf files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ @@ -74,3 +74,117 @@ repos: (?x)^( paddle/fluid/operators/CMakeLists.txt )$ + +- repo: https://github.com/cmake-lint/cmake-lint + rev: 1.4.2 + hooks: + - id: cmakelint + args: [--config=./tools/codestyle/.cmakelintrc] + # exclude files which need to be fixed + exclude: | + (?x)^( + cmake/generic.cmake| + CMakeLists.txt| + paddle/fluid/pybind/CMakeLists.txt| + python/paddle/fluid/tests/unittests/CMakeLists.txt| + paddle/fluid/eager/auto_code_generator/CMakeLists.txt| + paddle/fluid/framework/CMakeLists.txt| + 
paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt| + cmake/third_party.cmake| + paddle/fluid/inference/tests/infer_ut/CMakeLists.txt| + cmake/configure.cmake| + paddle/fluid/inference/api/demo_ci/CMakeLists.txt| + cmake/flags.cmake| + cmake/inference_lib.cmake| + cmake/external/protobuf.cmake| + cmake/system.cmake| + cmake/cudnn.cmake| + cmake/external/mkldnn.cmake| + cmake/unity_build.cmake| + paddle/fluid/framework/fleet/CMakeLists.txt| + paddle/fluid/inference/CMakeLists.txt| + paddle/fluid/inference/tests/api/CMakeLists.txt| + paddle/fluid/operators/CMakeLists.txt| + paddle/phi/api/lib/CMakeLists.txt| + cmake/external/gflags.cmake| + cmake/external/lite.cmake| + cmake/external/poplar.cmake| + cmake/python_module.cmake| + python/paddle/fluid/tests/unittests/asp/CMakeLists.txt| + cmake/cuda.cmake| + cmake/FindNumPy.cmake| + cmake/phi.cmake| + paddle/fluid/framework/ir/CMakeLists.txt| + paddle/fluid/platform/CMakeLists.txt| + python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt| + python/paddle/tests/CMakeLists.txt| + cmake/ccache.cmake| + cmake/coveralls.cmake| + cmake/external/glog.cmake| + cmake/external/onnxruntime.cmake| + cmake/external/openblas.cmake| + cmake/external/xpu.cmake| + cmake/hip.cmake| + paddle/fluid/distributed/CMakeLists.txt| + paddle/fluid/framework/details/CMakeLists.txt| + paddle/fluid/imperative/CMakeLists.txt| + paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt| + paddle/fluid/inference/api/CMakeLists.txt| + paddle/fluid/operators/controlflow/CMakeLists.txt| + python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt| + cmake/cblas.cmake| + cmake/coverallsGcovJsons.cmake| + cmake/external/brpc.cmake| + cmake/external/cryptopp.cmake| + cmake/external/gtest.cmake| + cmake/external/llvm.cmake| + cmake/external/utf8proc.cmake| + cmake/external/warpctc.cmake| + cmake/external/zlib.cmake| + cmake/FindGperftools.cmake| + cmake/operators.cmake| + cmake/tensorrt.cmake| + paddle/fluid/inference/api/details/CMakeLists.txt| + python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt| + cmake/external/arm_brpc.cmake| + cmake/external/concurrentqueue.cmake| + cmake/external/eigen.cmake| + cmake/external/mklml.cmake| + cmake/external/paddle2onnx.cmake| + cmake/miopen.cmake| + cmake/nccl.cmake| + cmake/simd.cmake| + paddle/fluid/distributed/fleet_executor/CMakeLists.txt| + paddle/fluid/eager/api/generated/fluid_generated/forwards/CMakeLists.txt| + paddle/fluid/framework/io/CMakeLists.txt| + paddle/fluid/imperative/tests/CMakeLists.txt| + paddle/fluid/inference/analysis/CMakeLists.txt| + paddle/fluid/inference/tests/infer_ut/external-cmake/gtest-cpp.cmake| + paddle/fluid/memory/allocation/CMakeLists.txt| + paddle/fluid/memory/CMakeLists.txt| + paddle/fluid/operators/cinn/CMakeLists.txt| + paddle/fluid/operators/collective/CMakeLists.txt| + paddle/fluid/operators/ipu/CMakeLists.txt| + paddle/fluid/operators/jit/CMakeLists.txt| + paddle/fluid/operators/pscore/CMakeLists.txt| + paddle/fluid/platform/device/ipu/CMakeLists.txt| + paddle/fluid/platform/dynload/CMakeLists.txt| + paddle/infrt/external_kernels/CMakeLists.txt| + paddle/infrt/kernel/phi/CMakeLists.txt| + paddle/phi/backends/dynload/CMakeLists.txt| + paddle/phi/CMakeLists.txt| + paddle/phi/kernels/CMakeLists.txt| + paddle/phi/tests/core/CMakeLists.txt| + python/CMakeLists.txt| + python/paddle/fluid/contrib/slim/tests/CMakeLists.txt| + python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt| + python/paddle/fluid/tests/unittests/distribution/CMakeLists.txt| + 
python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt| + python/paddle/fluid/tests/unittests/fft/CMakeLists.txt| + python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt| + python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt| + python/paddle/fluid/tests/unittests/npu/CMakeLists.txt| + python/paddle/fluid/tests/unittests/ps/CMakeLists.txt| + python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt| + python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt + )$ diff --git a/CMakeLists.txt b/CMakeLists.txt index 70eb5f11ea168..ba438a74718f2 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,12 +13,12 @@ # limitations under the License if(APPLE AND WITH_ARM) - # cmake 3.19.2 version starts to support M1 - cmake_minimum_required(VERSION 3.19.2) - cmake_policy(VERSION 3.19.2) + # cmake 3.19.2 version starts to support M1 + cmake_minimum_required(VERSION 3.19.2) + cmake_policy(VERSION 3.19.2) else(APPLE AND WITH_ARM) - cmake_minimum_required(VERSION 3.15) - cmake_policy(VERSION 3.10) + cmake_minimum_required(VERSION 3.15) + cmake_policy(VERSION 3.10) endif(APPLE AND WITH_ARM) # use to get_property location of static lib # https://cmake.org/cmake/help/v3.0/policy/CMP0026.html?highlight=cmp0026 @@ -31,9 +31,12 @@ include(system) # Note(zhouwei): Ninja Generator will set CMAKE_BUILD_TYPE to Debug if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "Release" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) + set(CMAKE_BUILD_TYPE + "Release" + CACHE + STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) endif() project(paddle CXX C) @@ -42,157 +45,185 @@ project(paddle CXX C) # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) find_package(MKL CONFIG QUIET) -option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) -option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) -option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) -option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) -option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) -option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) -option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) -option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) -option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) +option(WITH_ONEMKL "Compile PaddlePaddle with oneMKL" OFF) +option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_XPU_KP "Compile PaddlePaddle with BAIDU XPU compiler " OFF) +option(WITH_MLU "Compile PaddlePaddle with CAMBRICON MLU" OFF) +option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) +option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) +option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) +option(WITH_IPU "Compile PaddlePaddle with Graphcore IPU" OFF) # NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 -option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) -option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) -option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" 
OFF) +option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) +option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +option(WITH_ONNXRUNTIME "Compile PaddlePaddle with ONNXRUNTIME" OFF) # Note(zhouwei): It use option above, so put here include(init) -include(generic) # simplify cmake module -include(experimental) # experimental build options +include(generic) # simplify cmake module +include(experimental) # experimental build options -if (WITH_GPU AND WITH_XPU) - message(FATAL_ERROR "Error when compile GPU and XPU at the same time") +if(WITH_GPU AND WITH_XPU) + message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() -if (WITH_GPU AND WITH_XPU_KP) - message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") +if(WITH_GPU AND WITH_XPU_KP) + message(FATAL_ERROR "Error when compile GPU and XPU2 at the same time") endif() -if (WITH_GPU AND WITH_ASCEND) - message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") +if(WITH_GPU AND WITH_ASCEND) + message(FATAL_ERROR "Error when compile GPU and ASCEND at the same time") endif() -if (WITH_GPU AND WITH_ROCM) - message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") +if(WITH_GPU AND WITH_ROCM) + message(FATAL_ERROR "Error when compile CUDA and ROCM at the same time") endif() -if (WITH_GPU AND WITH_MLU) - message(FATAL_ERROR "Error when compile GPU and MLU at the same time") +if(WITH_GPU AND WITH_MLU) + message(FATAL_ERROR "Error when compile GPU and MLU at the same time") endif() if(WITH_GPU AND NOT APPLE) - enable_language(CUDA) - message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " - "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") + enable_language(CUDA) + message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}, version: " + "${CMAKE_CUDA_COMPILER_ID} ${CMAKE_CUDA_COMPILER_VERSION}") endif() message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " - "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") + "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " - "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") + "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") message(STATUS "AR tools: ${CMAKE_AR}") # MUSL build turn off warnings if(WITH_MUSL) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy" + ) endif() if(APPLE AND WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") - set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") + set(CMAKE_CXX_FLAGS "${CMAKE_C_FLAGS} -target arm64-apple-darwin") endif() if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - if(WITH_ARM_BRPC) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") - else() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") - endif() + if(WITH_ARM_BRPC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=1") + else() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + endif() endif() if(WIN32) - option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) - - set(CMAKE_SUPPRESS_REGENERATION ON) - set(CMAKE_STATIC_LIBRARY_PREFIX 
lib) - - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + option(MSVC_STATIC_CRT "use static C Runtime library by default" ON) + + set(CMAKE_SUPPRESS_REGENERATION ON) + set(CMAKE_STATIC_LIBRARY_PREFIX lib) + + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj") + + if("${CMAKE_GENERATOR}" STREQUAL "Ninja") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zc:inline") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /Zc:inline") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zc:inline") + endif() + + if(MSVC_STATIC_CRT) + message( + STATUS + "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019" + ) + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + if(${flag_var} MATCHES "/MD") + string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") + endif() + endforeach(flag_var) + endif() + + # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally + add_definitions("-DNOMINMAX") + # windows build turn off warnings, use parallel compiling. + foreach( + flag_var + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS_RELWITHDEBINFO + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO) + string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") + + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, + # For Visual Studio generators, /MP should be added. + # For other generators like Ninja, it is not need to add /MP. 
+ if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() - - if (MSVC_STATIC_CRT) - message(STATUS "Use static C runtime time, refer to https://docs.microsoft.com/en-us/cpp/c-runtime-library/crt-library-features?view=vs-2019") - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - if(${flag_var} MATCHES "/MD") - string(REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) + endforeach(flag_var) + foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) + set(${flag_var} "${${flag_var}} /w") + endforeach(flag_var) + + # Windows Remove /Zi, /ZI for Release, MinSizeRel builds + foreach(flag_var + CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL + CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) + if(${flag_var} MATCHES "/Z[iI]") + string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") endif() + endforeach(flag_var) + + set(CMAKE_C_FLAGS + "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838" + ) + + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} + "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() + endforeach(flag_var) - # NOTE(zhouwei): msvc max/min macro conflict with std::min/max, define NOMINMAX globally - add_definitions("-DNOMINMAX") - # windows build turn off warnings, use parallel compiling. - foreach(flag_var - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE - CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO - CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE - CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) - string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - - # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, - # For Visual Studio generators, /MP should be added. - # For other generators like Ninja, it is not need to add /MP. 
- if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) - math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") - endif() - endforeach(flag_var) - foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) - set(${flag_var} "${${flag_var}} /w") - endforeach(flag_var) - - # Windows Remove /Zi, /ZI for Release, MinSizeRel builds - foreach(flag_var - CMAKE_C_FLAGS CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL - CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL) - if(${flag_var} MATCHES "/Z[iI]") - string(REGEX REPLACE "/Z[iI]" "" ${flag_var} "${${flag_var}}") - endif() - endforeach(flag_var) - - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4068 /wd4129 /wd4244 /wd4267 /wd4297 /wd4530 /wd4577 /wd4819 /wd4838") + if(WITH_WIN_DUMP_DBG) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") - if(MSVC_STATIC_CRT) - set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") - endif() + foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS + CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) + set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") endforeach(flag_var) - if (WITH_WIN_DUMP_DBG) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /Zi") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zi") - - foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) - set(${flag_var} "${${flag_var}} /DEBUG /OPT:REF /OPT:ICF") - endforeach(flag_var) - - add_definitions("-DWITH_WIN_DUMP_DBG") - endif() + add_definitions("-DWITH_WIN_DUMP_DBG") + endif() else(WIN32) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations") + set(CMAKE_CXX_FLAGS + "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations" + ) endif(WIN32) find_package(Git REQUIRED) @@ -200,7 +231,7 @@ find_package(Git REQUIRED) # config GIT_URL with github mirrors to speed up dependent repos clone option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) if(NOT GIT_URL) - set(GIT_URL "https://github.com") + set(GIT_URL "https://github.com") endif() find_package(Threads REQUIRED) @@ -208,60 +239,83 @@ find_package(Threads REQUIRED) include(simd) ################################ Exposed Configurations ####################################### -option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) -option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) -option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) -option(WITH_SYSTEM_BLAS "Use system blas library" OFF) -option(WITH_DISTRIBUTE "Compile with distributed support" OFF) -option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) -option(ON_INFER "Turn on inference optimization and inference-lib generation" OFF) +option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) +option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) +option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) +option(WITH_MKL "Compile PaddlePaddle with MKL support." 
${AVX_FOUND}) +option(WITH_SYSTEM_BLAS "Use system blas library" OFF) +option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) +option(ON_INFER "Turn on inference optimization and inference-lib generation" + OFF) ################################ Internal Configurations ####################################### -option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) -option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) -option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) -option(WITH_INCREMENTAL_COVERAGE "Generate coverage reports only for incremental code" OFF) -OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF) -option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) -option(WITH_PSLIB "Compile with pslib support" OFF) -option(WITH_BOX_PS "Compile with box_ps support" OFF) -option(WITH_XBYAK "Compile with xbyak support" ON) -option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) -option(WITH_HETERPS "Compile with heterps" OFF}) -option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) -option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) -option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) -option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) -option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) -option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) -option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) -option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) -option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) -option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) -option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) -option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) -option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) -option(WITH_SW "Compile PaddlePaddle with sw support" OFF) -option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) -option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) -option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) -option(WITH_STRIP "Strip so files of Whl packages" OFF) -option(NEW_RELEASE_PYPI "PaddlePaddle next-level release strategy for pypi cubin package" OFF) -option(NEW_RELEASE_ALL "PaddlePaddle next-level release strategy for all arches cubin package" OFF) -option(NEW_RELEASE_JIT "PaddlePaddle next-level release strategy for backup jit package" OFF) -option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) -option(WITH_POCKETFFT "Compile with pocketfft support" ON) -option(WITH_RECORD_BUILDTIME "Compile PaddlePaddle with record all targets build time" OFF) -option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) -option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) -option(WITH_FLPS "FL PS mode" OFF) +option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF) +option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" + OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) +option(WITH_INCREMENTAL_COVERAGE + "Generate coverage reports only for incremental code" OFF) +option(WITH_LIBXSMM "Compile with libxsmm" OFF) +option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) +option(WITH_PSLIB "Compile with 
pslib support" OFF) +option(WITH_BOX_PS "Compile with box_ps support" OFF) +option(WITH_XBYAK "Compile with xbyak support" ON) +option(WITH_CONTRIB "Compile the third-party contributation" OFF) +option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) +option(WITH_HETERPS "Compile with heterps" OFF}) +option(WITH_INFERENCE_API_TEST + "Test fluid inference C++ high-level api interface" OFF) +option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) +option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}) +option( + SANITIZER_TYPE + "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" + OFF) +option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) +option(WITH_CINN "Compile PaddlePaddle with CINN" OFF) +option(WITH_INFRT "Compile PaddlePaddle with INFRT" OFF) +option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON) +option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) +option(WITH_CNCL "Compile PaddlePaddle with CNCL support" OFF) +option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) +option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) +option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) +option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) +option(WITH_UNITY_BUILD "Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) +option(NEW_RELEASE_PYPI + "PaddlePaddle next-level release strategy for pypi cubin package" OFF) +option(NEW_RELEASE_ALL + "PaddlePaddle next-level release strategy for all arches cubin package" + OFF) +option(NEW_RELEASE_JIT + "PaddlePaddle next-level release strategy for backup jit package" OFF) +option(WITH_ASCEND_INT64 "Compile with int64 kernel for ascend NPU" OFF) +option(WITH_POCKETFFT "Compile with pocketfft support" ON) +option(WITH_RECORD_BUILDTIME + "Compile PaddlePaddle with record all targets build time" OFF) +option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) +option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) +option(WITH_FLPS "FL PS mode" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") -else() - include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache + set_property( + GLOBAL + PROPERTY + RULE_LAUNCH_COMPILE + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}" + ) + set_property( + GLOBAL + PROPERTY + RULE_LAUNCH_LINK + "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}" + ) +else() + include(ccache + )# set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() unset(WITH_RECORD_BUILDTIME CACHE) @@ -271,191 +325,237 @@ if(NOT PY_VERSION) endif() set(PYBIND11_PYTHON_VERSION ${PY_VERSION}) - # the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined. 
Default: OFF -if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES "^(Address|Leak|Memory|Thread|Undefined)$") +if(SANITIZER_TYPE AND NOT "${SANITIZER_TYPE}" MATCHES + "^(Address|Leak|Memory|Thread|Undefined)$") message("Choose the correct type of sanitizer") return() endif() -if (LINUX AND NOT WITH_CUSTOM_DEVICE AND NOT ON_INFER) -set(WITH_CUSTOM_DEVICE ON) +if(LINUX + AND NOT WITH_CUSTOM_DEVICE + AND NOT ON_INFER) + set(WITH_CUSTOM_DEVICE ON) endif() if(WIN32) - if(WITH_DISTRIBUTE) - MESSAGE(WARNING - "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF.") - set(WITH_DISTRIBUTE OFF CACHE STRING - "Disable DISTRIBUTE when compiling for Windows" FORCE) - endif() - if(WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling for Windows" FORCE) - endif() -endif() - -if (NOT WITH_GPU AND WITH_NCCL) - MESSAGE(WARNING - "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") - set(WITH_NCCL OFF CACHE STRING - "Disable NCCL when compiling without GPU" FORCE) + if(WITH_DISTRIBUTE) + message( + WARNING + "Disable DISTRIBUTE when compiling for Windows. Force WITH_DISTRIBUTE=OFF." + ) + set(WITH_DISTRIBUTE + OFF + CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) + endif() + if(WITH_NCCL) + message( + WARNING "Disable NCCL when compiling for Windows. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling for Windows" FORCE) + endif() +endif() + +if(NOT WITH_GPU AND WITH_NCCL) + message( + WARNING "Disable NCCL when compiling without GPU. Force WITH_NCCL=OFF.") + set(WITH_NCCL + OFF + CACHE STRING "Disable NCCL when compiling without GPU" FORCE) endif() # force WITH_XPU on when WITH_XPU_KP -if (WITH_XPU_KP AND NOT WITH_XPU) - MESSAGE(WARNING - "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") - set(WITH_XPU ON CACHE STRING - "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) +if(WITH_XPU_KP AND NOT WITH_XPU) + message( + WARNING + "Enable WITH_XPU when compiling with WITH_XPU_KP. Force WITH_XPU=ON.") + set(WITH_XPU + ON + CACHE STRING "Enable WITH_XPU when compiling with WITH_XPU_KP" FORCE) endif() -if (NOT WITH_XPU AND WITH_XPU_BKCL) - MESSAGE(WARNING - "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") - set(WITH_XPU_BKCL OFF CACHE STRING - "Disable BKCL when compiling without XPU" FORCE) +if(NOT WITH_XPU AND WITH_XPU_BKCL) + message( + WARNING "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.") + set(WITH_XPU_BKCL + OFF + CACHE STRING "Disable BKCL when compiling without XPU" FORCE) endif() -if (NOT WITH_MLU AND WITH_CNCL) - MESSAGE(WARNING - "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.") - set(WITH_MLU OFF CACHE STRING - "Disable CNCL when compiling without MLU" FORCE) +if(NOT WITH_MLU AND WITH_CNCL) + message( + WARNING "Disable CNCL when compiling without MLU. Force WITH_MLU=OFF.") + set(WITH_MLU + OFF + CACHE STRING "Disable CNCL when compiling without MLU" FORCE) endif() if(WITH_NCCL) - add_definitions("-DPADDLE_WITH_NCCL") - include(nccl) + add_definitions("-DPADDLE_WITH_NCCL") + include(nccl) else() - if(WITH_GPU) - MESSAGE(WARNING "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_GPU) + message( + WARNING + "If the environment is multi-card, the WITH_NCCL option needs to be turned on, otherwise only a single card can be used." 
+ ) + endif() endif() if(WITH_BRPC_RDMA) - message(STATUS "Use brpc with rdma.") - if(NOT WITH_DISTRIBUTE) - message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") - endif() + message(STATUS "Use brpc with rdma.") + if(NOT WITH_DISTRIBUTE) + message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") + endif() endif() - if(WITH_GPU) - include(cuda) - # lite subgraph compilation depends on CUDNN_ROOT, - # so include(cudnn) needs to be in front of include(third_party/lite) - include(cudnn) # set cudnn libraries, must before configure - include(tensorrt) - # there is no official support of nccl, cupti in windows - if(NOT WIN32) - include(cupti) - endif() + include(cuda) + # lite subgraph compilation depends on CUDNN_ROOT, + # so include(cudnn) needs to be in front of include(third_party/lite) + include(cudnn) # set cudnn libraries, must before configure + include(tensorrt) + # there is no official support of nccl, cupti in windows + if(NOT WIN32) + include(cupti) + endif() endif() if(WITH_MLU) - include(neuware) + include(neuware) endif() if(WITH_ROCM) - include(hip) - include(miopen) # set miopen libraries, must before configure + include(hip) + include(miopen) # set miopen libraries, must before configure endif(WITH_ROCM) if(WITH_XPU_KP) - include(xpu_kp) + include(xpu_kp) endif() -if (NOT WITH_ROCM AND WITH_RCCL) - MESSAGE(WARNING - "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") - set(WITH_RCCL OFF CACHE STRING - "Disable RCCL when compiling without ROCM" FORCE) +if(NOT WITH_ROCM AND WITH_RCCL) + message( + WARNING "Disable RCCL when compiling without ROCM. Force WITH_RCCL=OFF.") + set(WITH_RCCL + OFF + CACHE STRING "Disable RCCL when compiling without ROCM" FORCE) endif() if(WITH_RCCL) - add_definitions("-DPADDLE_WITH_RCCL") - include(rccl) + add_definitions("-DPADDLE_WITH_RCCL") + include(rccl) else() - if(WITH_ROCM) - MESSAGE(WARNING "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used.") - endif() + if(WITH_ROCM) + message( + WARNING + "If the environment is multi-card, the WITH_RCCL option needs to be turned on, otherwise only a single card can be used." + ) + endif() endif() if(WITH_HETERPS AND WITH_PSLIB) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() if(WITH_DISTRIBUTE) - if(LINUX) - set(WITH_GLOO ON CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." FORCE) - endif() - if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) - # disable WITH_PSCORE for NPU before include third_party - MESSAGE(WARNING "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) - endif() - if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) - # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 - MESSAGE(WARNING "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496. Force WITH_PSCORE=OFF.") - set(WITH_PSCORE OFF CACHE BOOL "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496" FORCE) - endif() -endif() - -include(third_party) # download, build, install third_party, Contains about 20+ dependencies - -include(flags) # set paddle compile flags + if(LINUX) + set(WITH_GLOO + ON + CACHE STRING "Enable GLOO when compiling WITH_DISTRIBUTE=ON." 
FORCE) + endif() + if(WITH_ASCEND_CL AND NOT WITH_ARM_BRPC) + # disable WITH_PSCORE for NPU before include third_party + message( + WARNING + "Disable WITH_PSCORE when compiling with NPU. Force WITH_PSCORE=OFF.") + set(WITH_PSCORE + OFF + CACHE BOOL "Disable WITH_PSCORE when compiling with NPU" FORCE) + endif() + if(WITH_ROCM AND HIP_VERSION LESS_EQUAL 40020496) + # TODO(qili93): third-party rocksdb throw Illegal instruction with HIP version 40020496 + message( + WARNING + "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496. Force WITH_PSCORE=OFF." + ) + set(WITH_PSCORE + OFF + CACHE + BOOL + "Disable WITH_PSCORE when HIP_VERSION is less than or equal 40020496" + FORCE) + endif() +endif() + +include(third_party +)# download, build, install third_party, Contains about 20+ dependencies + +include(flags) # set paddle compile flags if(WITH_PROFILER) - find_package(Gperftools REQUIRED) - include_directories(${GPERFTOOLS_INCLUDE_DIR}) - add_definitions(-DWITH_GPERFTOOLS) + find_package(Gperftools REQUIRED) + include_directories(${GPERFTOOLS_INCLUDE_DIR}) + add_definitions(-DWITH_GPERFTOOLS) endif() -include(util) # set unittest and link libs -include(version) # set PADDLE_VERSION -include(coveralls) # set code coverage -include(configure) # add paddle env configuration +include(util) # set unittest and link libs +include(version) # set PADDLE_VERSION +include(coveralls) # set code coverage +include(configure) # add paddle env configuration include_directories("${PADDLE_SOURCE_DIR}") if(WITH_NV_JETSON) - set(WITH_ARM ON CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) + set(WITH_ARM + ON + CACHE STRING "Set WITH_ARM=ON when compiling WITH_NV_JETSON=ON." FORCE) endif() if(WITH_ARM) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." FORCE) - add_definitions(-DPADDLE_WITH_ARM) -endif() - -if (WITH_SW) - # mieee flag solves floating-point exceptions under sw and ALPHA architectures - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) - set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE) - add_definitions(-DPADDLE_WITH_SW) -endif() - -if (WITH_MIPS) - set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) - add_definitions(-DPADDLE_WITH_MIPS) -endif() - -if (WITH_ONEMKL) - add_definitions(-DPADDLE_WITH_ONEMKL) -endif() - -if (WITH_HETERPS) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") - endif() + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON." FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) + set(WITH_AVX + OFF + CACHE STRING "Disable AVX when compiling WITH_AVX=OFF." 
FORCE) + add_definitions(-DPADDLE_WITH_ARM) +endif() + +if(WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL + OFF + CACHE STRING "Disable MKL when compiling WITH_SW=ON." FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + +if(WITH_MIPS) + set(WITH_XBYAK + OFF + CACHE STRING "Disable XBYAK when compiling WITH_MIPS=ON" FORCE) + add_definitions(-DPADDLE_WITH_MIPS) +endif() + +if(WITH_ONEMKL) + add_definitions(-DPADDLE_WITH_ONEMKL) +endif() + +if(WITH_HETERPS) + if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") @@ -465,25 +565,32 @@ set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") add_definitions(-DPADDLE_DLL_EXPORT) if(ON_INFER) - # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF - message(STATUS "On inference mode, will take place some specific optimization.") - include(inference_lib) - add_definitions(-DPADDLE_ON_INFERENCE) + # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF + message( + STATUS "On inference mode, will take place some specific optimization.") + include(inference_lib) + add_definitions(-DPADDLE_ON_INFERENCE) else() - #TODO(luotao), combine this warning with `make inference_lib_dist` command. - message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") + #TODO(luotao), combine this warning with `make inference_lib_dist` command. + message( + WARNING + "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only." + ) endif() if(WITH_STRIP) - find_program(STRIP_PATH strip) - if(NOT STRIP_PATH OR NOT LINUX) - set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) - endif() + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP + OFF + CACHE STRING "Command strip is only used on Linux when it exists." 
+ FORCE) + endif() endif() add_subdirectory(paddle) if(WITH_PYTHON) - add_subdirectory(python) + add_subdirectory(python) endif() get_directory_property(all_inc_dirs INCLUDE_DIRECTORIES) diff --git a/tools/codestyle/.cmakelintrc b/tools/codestyle/.cmakelintrc new file mode 100644 index 0000000000000..6c5fe30276fc6 --- /dev/null +++ b/tools/codestyle/.cmakelintrc @@ -0,0 +1 @@ +filter=-readability/wonkycase,-syntax,-convention/filename,-package/stdargs,-whitespace/indent From 264de612eb2d2d4742cd74f63a0686d9a287c461 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 7 Jun 2022 10:22:19 +0800 Subject: [PATCH 09/53] update docker (#43136) --- paddle/scripts/paddle_build.sh | 1 - tools/dockerfile/ci_dockerfile.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index b3862ea6b3232..ad081d8128162 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -3300,7 +3300,6 @@ function check_coverage_build() { set -x } - function main() { local CMD=$1 local parallel_number=$2 diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index 485bfd7968f05..1195e4c4594c6 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -20,7 +20,7 @@ function make_ubuntu_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i "${dockerfile_line}i RUN apt remove git -y \&\& apt install -y libcurl4-openssl-dev gettext zstd \&\& wget -q https://paddle-ci.gz.bcebos.com/git-2.17.1.tar.gz \&\& \ tar -xvf git-2.17.1.tar.gz \&\& \ cd git-2.17.1 \&\& \ From d9f8636c3d44a70f114d910fa31c15a25846e344 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:30:03 +0800 Subject: [PATCH 10/53] Supoort more dimensions in forward fast layer_norm kernel (#43226) --- .../operators/fused/fused_layernorm_residual_dropout_bias.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index f72f73438c0a2..fc044e0bafa31 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -481,10 +481,12 @@ void LaunchLayernormResidualDropoutBias( LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1536); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(1792); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(2048); \ + LAUNCH_FUSED_FAST_LN_KERNEL_BASE(3072); \ LAUNCH_FUSED_FAST_LN_KERNEL_BASE(4096) bool can_call_fast_ln_kernel = false; - if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 4096) && + if (((cols >= 768 && cols <= 2048 && cols % 256 == 0) || cols == 3072 || + cols == 4096) && scale != nullptr && layernorm_bias != nullptr) { can_call_fast_ln_kernel = true; } From a2020d0cc369d7d2cf5c4d7eae41f007afb8ab89 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:36:03 +0800 Subject: [PATCH 11/53] fix dropout (#43234) --- 
paddle/fluid/operators/dropout_impl.cu.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index c40f6c0bbaea0..6db3efa3cdd60 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -198,11 +198,13 @@ void DropoutFwGPUKernelDriver(const phi::GPUContext& dev_ctx, bool is_test, size_t main_offset = size / (block_size * kVecSize) * (block_size * kVecSize); +#define PD_DROPOUT_KERNEL_NAME VectorizedRandomGenerator PD_RECORD_CUDA_GRAPH_RANDOM_KERNEL( - !is_fix_seed, (VectorizedRandomGenerator), grid_size, - block_size, 0, stream, offset, KERNEL_PARAMS.As(1), - KERNEL_PARAMS.As(7), size, seed_data, dropout_prob, x_data, - mask_data, y_data, upscale_in_train, increment, main_offset); + !is_fix_seed, PD_DROPOUT_KERNEL_NAME, grid_size, block_size, 0, stream, + offset, KERNEL_PARAMS.As(1), KERNEL_PARAMS.As(7), + size, seed_data, dropout_prob, x_data, mask_data, y_data, + upscale_in_train, increment, main_offset); +#undef PD_DROPOUT_KERNEL_NAME } else { if (upscale_in_train) { // todo: can y share with data with x directly? From aec49361ee75a44c453ecfbfd996ad7373686864 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Tue, 7 Jun 2022 10:38:21 +0800 Subject: [PATCH 12/53] [XPU KP]Add xpu register, any, amax, amin op test (#43204) --- .../{reduce_amax_op.cu => reduce_amax_op.kps} | 14 ++- .../{reduce_amin_op.cu => reduce_amin_op.kps} | 14 ++- paddle/phi/kernels/funcs/reduce_function.h | 15 +-- .../kernels/{gpu => kps}/reduce_any_kernel.cu | 6 +- paddle/phi/kernels/kps/reduce_max_kernel.cu | 1 - .../{gpu => kps}/reduce_prod_kernel.cu | 7 +- .../primitive/compute_primitives_xpu2.h | 15 +-- paddle/phi/kernels/reduce_all_kernel.cc | 4 + paddle/phi/kernels/reduce_any_kernel.cc | 4 + paddle/phi/kernels/reduce_max_kernel.cc | 4 + paddle/phi/kernels/reduce_mean_kernel.cc | 4 + paddle/phi/kernels/reduce_min_kernel.cc | 4 + paddle/phi/kernels/reduce_prod_kernel.cc | 4 + paddle/phi/kernels/reduce_sum_kernel.cc | 6 ++ .../unittests/xpu/test_reduce_amax_op_xpu.py | 67 +++++++++++++ .../unittests/xpu/test_reduce_amin_op_xpu.py | 67 +++++++++++++ .../unittests/xpu/test_reduce_any_op_xpu.py | 99 +++++++++++++++++++ 17 files changed, 315 insertions(+), 20 deletions(-) rename paddle/fluid/operators/reduce_ops/{reduce_amax_op.cu => reduce_amax_op.kps} (77%) rename paddle/fluid/operators/reduce_ops/{reduce_amin_op.cu => reduce_amin_op.kps} (77%) rename paddle/phi/kernels/{gpu => kps}/reduce_any_kernel.cu (87%) rename paddle/phi/kernels/{gpu => kps}/reduce_prod_kernel.cu (91%) create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py diff --git a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps similarity index 77% rename from paddle/fluid/operators/reduce_ops/reduce_amax_op.cu rename to paddle/fluid/operators/reduce_ops/reduce_amax_op.kps index b33859153419c..0998727918469 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amax_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amax_op.kps @@ -12,13 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#ifndef PADDLE_WITH_XPU_KP #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#endif + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; -// reduce_max +#ifdef PADDLE_WITH_XPU_KP +REGISTER_OP_KERNEL( + reduce_amax, KP, plat::XPUPlace, + ops::ReduceCudaKernel); +#else REGISTER_OP_CUDA_KERNEL( reduce_amax, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel); +#endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu b/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps similarity index 77% rename from paddle/fluid/operators/reduce_ops/reduce_amin_op.cu rename to paddle/fluid/operators/reduce_ops/reduce_amin_op.kps index 037dab396c757..5e1139396d90c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_amin_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_amin_op.kps @@ -12,13 +12,25 @@ // See the License for the specific language governing permissions and // limitations under the License. +#ifndef PADDLE_WITH_XPU_KP #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" +#endif + #include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; -// reduce_min +#ifdef PADDLE_WITH_XPU_KP +REGISTER_OP_KERNEL( + reduce_amin, KP, plat::XPUPlace, + ops::ReduceCudaKernel); +#else REGISTER_OP_CUDA_KERNEL( reduce_amin, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel, ops::ReduceCudaKernel); +#endif diff --git a/paddle/phi/kernels/funcs/reduce_function.h b/paddle/phi/kernels/funcs/reduce_function.h index 5c74751b348c0..4d903e01a4982 100644 --- a/paddle/phi/kernels/funcs/reduce_function.h +++ b/paddle/phi/kernels/funcs/reduce_function.h @@ -236,8 +236,9 @@ struct IndexCalculator { template struct ReduceIndexMapping { const kps::DimConfig dim; - HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims) - : dim(dims) {} + int loop_size; + HOSTDEVICE ReduceIndexMapping(const kps::DimConfig& dims, int max_loop = 1) + : dim(dims), loop_size(max_loop) {} #ifdef PADDLE_WITH_XPU_KP __device__ __forceinline__ int BlockIdX() { @@ -277,10 +278,10 @@ struct ReduceIndexMapping { } __device__ __forceinline__ int GetLoopSize() { - if (ReduceLastDim) { - return dim.deal_size_y; - } else { + if ((!ReduceLastDim) && (loop_size == 1)) { return dim.deal_size_x; + } else { + return loop_size; } } #else @@ -670,7 +671,7 @@ __global__ void ReduceAnyKernel(const Tx* x, int store_offset = 0; int stride_left = 0; if (reduce_last_dim) { - auto block = ReduceIndexMapping(dim); + auto block = ReduceIndexMapping(dim, left_num); input_idx = block.BlockIdY() * block.BlockDimX(); left_idx = block.BlockIdX() * block.BlockDimY() + THREAD_ID_Y; stride = block.GridDimY() * block.BlockDimX(); @@ -681,7 +682,7 @@ __global__ void ReduceAnyKernel(const Tx* x, stride_left = 1; tid = THREAD_ID_X; } else { - auto block = ReduceIndexMapping(dim); + auto block = ReduceIndexMapping(dim, left_num); input_idx = block.BlockIdY() * block.BlockDimY(); left_idx = block.BlockIdX() * block.BlockDimX() + THREAD_ID_X; stride = block.GridDimY() * block.BlockDimY(); diff --git a/paddle/phi/kernels/gpu/reduce_any_kernel.cu b/paddle/phi/kernels/kps/reduce_any_kernel.cu similarity index 87% rename from paddle/phi/kernels/gpu/reduce_any_kernel.cu rename to paddle/phi/kernels/kps/reduce_any_kernel.cu index 
25f73c64a5417..480268936f49f 100644 --- a/paddle/phi/kernels/gpu/reduce_any_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_any_kernel.cu @@ -32,4 +32,8 @@ void AnyRawKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_KERNEL(any_raw, GPU, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(any_raw, KPS, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#else +PD_REGISTER_KERNEL(any_raw, KPS, ALL_LAYOUT, phi::AnyRawKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/kps/reduce_max_kernel.cu b/paddle/phi/kernels/kps/reduce_max_kernel.cu index bc997c6c4e3b6..52644849ad8bf 100644 --- a/paddle/phi/kernels/kps/reduce_max_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_max_kernel.cu @@ -37,5 +37,4 @@ PD_REGISTER_KERNEL(max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float) {} #else PD_REGISTER_KERNEL( max_raw, KPS, ALL_LAYOUT, phi::MaxRawKernel, float, double, int, int64_t) {} - #endif diff --git a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu b/paddle/phi/kernels/kps/reduce_prod_kernel.cu similarity index 91% rename from paddle/phi/kernels/gpu/reduce_prod_kernel.cu rename to paddle/phi/kernels/kps/reduce_prod_kernel.cu index 4ae1dcfeba0a1..13d8e29b60b12 100644 --- a/paddle/phi/kernels/gpu/reduce_prod_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_prod_kernel.cu @@ -31,12 +31,15 @@ void ProdRawKernel(const Context& dev_ctx, } } // namespace phi - +#ifdef PADDLE_WITH_XPU_KP +PD_REGISTER_KERNEL(prod_raw, KPS, ALL_LAYOUT, phi::ProdRawKernel, float) {} +#else PD_REGISTER_KERNEL(prod_raw, - GPU, + KPS, ALL_LAYOUT, phi::ProdRawKernel, float, double, int, int64_t) {} +#endif diff --git a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h index 6ec05ee505443..38a8d40aee628 100644 --- a/paddle/phi/kernels/primitive/compute_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/compute_primitives_xpu2.h @@ -48,7 +48,7 @@ static inline __device__ void sync_all() { #define ncores 64 template -__device__ void BlockXReduce(T* data, OpFunc reducer) { +__device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) { __shared__ T sum_array[ncores * VecSize]; int core_idx = core_id() * VecSize; mfence(); @@ -57,21 +57,22 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { #pragma unroll for (int i = 0; i < VecSize; i++) { mfence(); - sum_array[core_idx + i] = data[i]; + sum_array[i * ncores + core_idx] = data[i]; mfence(); - data[i] = 0; } sync_all(); #pragma unroll for (int i = 0; i < VecSize; i++) { + T start = data[i * ncores]; #pragma unroll - for (int j = 0; j < ncores; j++) { + for (int j = 1; j < ncores; j++) { mfence(); - T tmp = sum_array[j * VecSize + i]; + T tmp = sum_array[i * ncores + j]; mfence(); - data[i] = reducer(data[i], tmp); + start = reducer(start, tmp); mfence(); } + out[i] = start; } sync_all(); } @@ -346,7 +347,7 @@ __device__ __forceinline__ void Reduce(T* out, if (reduce_last_dim) { #pragma unroll for (int i = 0; i < NY * NX; i++) { // reduce along blockDim.x - details::BlockXReduce(&out[i], reducer); + details::BlockXReduce(&out[i], &in[i], reducer); } } } else { // else kLocalMode diff --git a/paddle/phi/kernels/reduce_all_kernel.cc b/paddle/phi/kernels/reduce_all_kernel.cc index 5525f0dbfa7ed..9b4515ee2909f 100644 --- a/paddle/phi/kernels/reduce_all_kernel.cc +++ b/paddle/phi/kernels/reduce_all_kernel.cc @@ -36,3 +36,7 @@ PD_REGISTER_KERNEL(all, CPU, ALL_LAYOUT, phi::AllKernel, bool) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(all, GPU, 
ALL_LAYOUT, phi::AllKernel, bool) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(all, KPS, ALL_LAYOUT, phi::AllKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_any_kernel.cc b/paddle/phi/kernels/reduce_any_kernel.cc index 01cbcd4029c77..642b80c3d86f0 100644 --- a/paddle/phi/kernels/reduce_any_kernel.cc +++ b/paddle/phi/kernels/reduce_any_kernel.cc @@ -36,3 +36,7 @@ PD_REGISTER_KERNEL(any, CPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(any, GPU, ALL_LAYOUT, phi::AnyKernel, bool) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(any, KPS, ALL_LAYOUT, phi::AnyKernel, bool) {} +#endif diff --git a/paddle/phi/kernels/reduce_max_kernel.cc b/paddle/phi/kernels/reduce_max_kernel.cc index a7458a3e0ac13..26b8bc196ccd4 100644 --- a/paddle/phi/kernels/reduce_max_kernel.cc +++ b/paddle/phi/kernels/reduce_max_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( max, GPU, ALL_LAYOUT, phi::MaxKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(max, KPS, ALL_LAYOUT, phi::MaxKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_mean_kernel.cc b/paddle/phi/kernels/reduce_mean_kernel.cc index 812cf8702e15c..599b7eca32110 100644 --- a/paddle/phi/kernels/reduce_mean_kernel.cc +++ b/paddle/phi/kernels/reduce_mean_kernel.cc @@ -46,3 +46,7 @@ PD_REGISTER_KERNEL(mean, int64_t, phi::dtype::float16) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(mean, KPS, ALL_LAYOUT, phi::MeanKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_min_kernel.cc b/paddle/phi/kernels/reduce_min_kernel.cc index 620b5167566f2..75d906aa4bd75 100644 --- a/paddle/phi/kernels/reduce_min_kernel.cc +++ b/paddle/phi/kernels/reduce_min_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( min, GPU, ALL_LAYOUT, phi::MinKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(min, KPS, ALL_LAYOUT, phi::MinKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_prod_kernel.cc b/paddle/phi/kernels/reduce_prod_kernel.cc index 5bd410709c6ba..3bb1c7552b11f 100644 --- a/paddle/phi/kernels/reduce_prod_kernel.cc +++ b/paddle/phi/kernels/reduce_prod_kernel.cc @@ -38,3 +38,7 @@ PD_REGISTER_KERNEL( PD_REGISTER_KERNEL( prod, GPU, ALL_LAYOUT, phi::ProdKernel, float, double, int, int64_t) {} #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(prod, KPS, ALL_LAYOUT, phi::ProdKernel, float) {} +#endif diff --git a/paddle/phi/kernels/reduce_sum_kernel.cc b/paddle/phi/kernels/reduce_sum_kernel.cc index e2b13333d7f81..0d79fa34bc274 100644 --- a/paddle/phi/kernels/reduce_sum_kernel.cc +++ b/paddle/phi/kernels/reduce_sum_kernel.cc @@ -69,3 +69,9 @@ PD_REGISTER_KERNEL(sum, kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); } #endif + +#if defined(PADDLE_WITH_XPU_KP) +PD_REGISTER_KERNEL(sum, KPS, ALL_LAYOUT, phi::SumKernel, float) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} +#endif diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py new file mode 100644 index 0000000000000..a6a0c7b5920a8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amax_op_xpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_amax' + + class XPUTestReduceAmaxBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_amax' + self.shape = (20, 10) + self.attrs = {'use_xpu': True, 'keep_dim': False, 'dim': (1, )} + + self.inputs = { + 'X': np.random.randint(0, 100, self.shape).astype("float32") + } + + expect_intput = self.inputs['X'] + self.outputs = { + 'Out': + np.amax(expect_intput, + axis=self.attrs['dim'], + keepdims=self.attrs['keep_dim']) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('reduce_amax') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAmaxOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py new file mode 100644 index 0000000000000..def6c0821f5a3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_amin_op_xpu.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAmaxOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_amin' + + class XPUTestReduceAmaxBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_amin' + self.shape = (20, 10) + self.attrs = {'use_xpu': True, 'keep_dim': False, 'dim': (1, )} + + self.inputs = { + 'X': np.random.randint(0, 100, self.shape).astype("float32") + } + + expect_intput = self.inputs['X'] + self.outputs = { + 'Out': + np.amin(expect_intput, + axis=self.attrs['dim'], + keepdims=self.attrs['keep_dim']) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + +support_types = get_xpu_op_support_types('reduce_amin') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAmaxOp, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py new file mode 100644 index 0000000000000..5118c3787e663 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_reduce_any_op_xpu.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys + +sys.path.append("..") + +import paddle +from op_test import OpTest +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + +paddle.enable_static() + + +class XPUTestReduceAnyOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'reduce_any' + + class XPUTestReduceAnyBase(XPUOpTest): + + def setUp(self): + self.place = paddle.XPUPlace(0) + self.set_case() + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': True, + 'dim': (3, 5, 4) + } + self.inputs = { + 'X': + np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + class XPUTestReduceAnyCase1(XPUTestReduceAnyBase): + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'dim': [1] + # 'reduce_all': True, + # 'keep_dim': True, + } + self.inputs = { + 'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=1)} + + class XPUTestReduceAnyCase2(XPUTestReduceAnyBase): + + def set_case(self): + self.op_type = 'reduce_any' + self.attrs = { + 'use_xpu': True, + 'reduce_all': True, + 'keep_dim': False, + 'dim': (3, 6) + } + self.inputs = { + 'X': + np.random.randint(0, 2, (2, 5, 3, 2, 2, 3, 4, 2)).astype("bool") + } + self.outputs = {'Out': self.inputs['X'].any(axis=self.attrs['dim'])} + + +support_types = get_xpu_op_support_types('reduce_any') +for stype in support_types: + create_test_class(globals(), XPUTestReduceAnyOp, stype) + +if __name__ == '__main__': + unittest.main() From 552808378be6d609301aa596a3f76b38e9c25467 Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Tue, 7 Jun 2022 11:23:01 +0800 Subject: [PATCH 13/53] fix conv3d doc, test=document_fix (#43253) --- python/paddle/nn/functional/conv.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 26f07c2f9a11c..f1d66a9e3a1b5 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -1244,10 +1244,10 @@ def conv3d(x, where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. bias (Tensor, optional): The bias, a Tensor of shape [M, ]. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a + stride (int|list|tuple, optional): The stride size. It means the stride in convolution. If stride is a list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. 
If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -1257,20 +1257,20 @@ def conv3d(x, when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv3D Layer. According to grouped + groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1 data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. - The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: - `[batch_size, input_channels, input_height, input_width]`. - name(str|None): For detailed information, please refer + will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. + The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: + `[batch_size, input_channels, input_depth, input_height, input_width]`. + name(str|None, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
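The docstring hunk above documents conv3d's stride, padding, dilation, groups and data_format arguments and corrects the layout strings to "NCDHW"/"NDHWC". As a quick sanity check of that corrected layout description, a minimal usage sketch follows (illustrative only; the random shapes and the printed result are assumptions for this note, not part of the patch):

    import paddle
    import paddle.nn.functional as F

    # NCDHW input: batch=2, channels=3, depth=height=width=8
    x = paddle.randn((2, 3, 8, 8, 8), dtype='float32')
    # weight shape [M, C/groups, kD, kH, kW], here M=6 output channels
    w = paddle.randn((6, 3, 3, 3, 3), dtype='float32')

    y = F.conv3d(x, w, stride=1, padding=1, dilation=1, groups=1,
                 data_format='NCDHW')
    print(y.shape)  # expected [2, 6, 8, 8, 8]

With stride 1, padding 1 and a 3x3x3 kernel the spatial sizes are preserved, which matches the `[batch_size, input_channels, input_depth, input_height, input_width]` ordering described in the updated docstring.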
From e74f287bb32fe899587b0881785ce29fe8d3ee1d Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 7 Jun 2022 11:25:33 +0800 Subject: [PATCH 14/53] fix the unittest bug of none grad of margin_cross_entropy when FLAGS_retain_grad_for_all_tensor change default setting (#43241) --- .../fluid/tests/unittests/parallel_margin_cross_entropy.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py index b77a04d8eea9c..26e9e05b82ab8 100644 --- a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py +++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py @@ -39,6 +39,7 @@ class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): def setUp(self): strategy = fleet.DistributedStrategy() fleet.init(is_collective=True, strategy=strategy) + paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) def test_parallel_margin_softmax_cross_entropy(self): margin1s = [1.0, 1.0, 1.35] From 71a63f0a9be78d371a648a7cc97456857cadf718 Mon Sep 17 00:00:00 2001 From: limingshu <61349199+JamesLim-sy@users.noreply.github.com> Date: Tue, 7 Jun 2022 11:28:49 +0800 Subject: [PATCH 15/53] Transpose optimization with assitant of Chengdu Supercomputing Center and auto_tune operation (#42704) --- paddle/fluid/operators/transpose_op.cu.h | 432 +++++++++++++++++- paddle/fluid/operators/transpose_op.h | 178 ++++++++ paddle/fluid/platform/fast_divmod.h | 2 +- paddle/phi/kernels/autotune/auto_tune_base.h | 114 +++-- paddle/phi/kernels/autotune/auto_tune_test.cu | 22 +- paddle/phi/kernels/autotune/cache.h | 5 +- 6 files changed, 709 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index 40a967b11f7a9..f9d91fec4c3f6 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -17,8 +17,12 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/autotune/auto_tune_base.h" +#include "paddle/phi/kernels/autotune/cache.h" +#include "paddle/phi/kernels/copy_kernel.h" namespace paddle { namespace operators { @@ -656,13 +660,437 @@ struct TransposeSimple { } }; +template +class IdxHelper { + public: + IdxHelper() {} + explicit IdxHelper(const T* dims) { + for (int i = N - 1; i >= 0; --i) { + stride_[i] = i < (N - 1) ? dims[i + 1] * stride_[i + 1] : 1; + } + } + + __device__ inline T GetStride(int idx) const { return stride_[idx]; } + + __device__ inline void GetIndexFromOffset(T offset, T* index) const { + T remaining = offset; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + const T idx = remaining / stride_[i]; + remaining -= idx * stride_[i]; + index[i] = idx; + } + index[N - 1] = remaining; + } + + private: + T stride_[N]; +}; + +template +class IdxHelper { + public: + IdxHelper() {} + explicit IdxHelper(const uint32_t* dims) { + for (int i = N - 1; i >= 0; --i) { + uint32_t value = i < (N - 1) ? 
dims[i + 1] * stride_[i + 1] : 1; + divmoder_[i] = paddle::platform::FastDivMod(value); + stride_[i] = value; + } + } + + __device__ inline uint32_t GetStride(int idx) const { return stride_[idx]; } + + __device__ inline void GetIndexFromOffset(uint32_t offset, + uint32_t* index) const { + uint32_t remaining = offset; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + uint32_t idx = divmoder_[i].Div(remaining); + index[i] = idx; + remaining -= idx * stride_[i]; + } + index[N - 1] = remaining; + } + + private: + uint32_t stride_[N]; + paddle::platform::FastDivMod divmoder_[N]; +}; + +// Transform index between memory offset and shape coodinate. +template +class IdxAndOffsetHelper { + public: + IdxAndOffsetHelper() {} + ~IdxAndOffsetHelper() = default; + + explicit IdxAndOffsetHelper(const T* dims) { + index_helper = IdxHelper(dims); + } + + template + explicit IdxAndOffsetHelper(const U* dims) { + T temp_dims[N]; + for (int i = 0; i < N; ++i) { + temp_dims[i] = static_cast(dims[i]); + } + index_helper = IdxHelper(temp_dims); + } + + __device__ inline T IndexToOffset(const T* index) const { + T offset = 0; +#pragma unroll + for (int i = 0; i < N - 1; ++i) { + offset += index[i] * index_helper.GetStride(i); + } + offset += index[N - 1]; + return offset; + } + + __device__ inline void OffsetToIndex(T offset, T* index) const { + index_helper.GetIndexFromOffset(offset, index); + } + + private: + IdxHelper index_helper; +}; + +template +struct PermuteParams { + public: + IdxAndOffsetHelper src_index_helper; + IdxAndOffsetHelper dst_index_helper; + int perm[Rank]{}; + + explicit PermuteParams(const std::vector& dims, + const std::vector& perm_) { + size_t dst_dims[Rank]; + for (size_t i = 0; i < Rank; ++i) { + dst_dims[i] = dims[perm_[i]]; + perm[i] = perm_[i]; + } + dst_index_helper = IdxAndOffsetHelper(dst_dims); + src_index_helper = IdxAndOffsetHelper(dims.data()); + } +}; + +// A special kernel for target case, both vectorized read and write supported. +template +__global__ void VectorizedPermuteKernel(PermuteParams params, + const size_t count, + const T* __restrict__ src_data, + T* dst_data) { + using VecT = phi::AlignedVector; + IndexT src_index[Rank]; + IndexT dst_index[Rank]; + + const VecT* __restrict__ src = + reinterpret_cast(src_data); + VecT* dst = reinterpret_cast(dst_data); + + IndexT tid = blockIdx.x * blockDim.x + threadIdx.x; + for (IndexT i = tid; i < count; i += blockDim.x * gridDim.x) { + params.dst_index_helper.OffsetToIndex(i, dst_index); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[params.perm[j]] = dst_index[j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index); + dst[i] = src[src_offset]; + } +} + +// A general kernel for normal case, only support vectorized write. +template +__global__ void GeneralPermuteKernel(PermuteParams params, + const T* __restrict__ src, T* dst, + const size_t main_cnt, + const size_t tail_cnt, + const size_t offset) { + using VecT = phi::AlignedVector; + VecT* vec_dst = reinterpret_cast(dst); + + IndexT src_index[VecSize][Rank]; + IndexT dst_index[VecSize][Rank]; + + // Avoid read perm data both in 2 load process. + __shared__ int perm[Rank]; + if (threadIdx.x < Rank) { + perm[threadIdx.x] = params.perm[threadIdx.x]; + } + __syncthreads(); + + // Vectorized load data. 
+ IndexT tid = blockIdx.x * blockDim.x + threadIdx.x; + for (IndexT idx = tid; idx < main_cnt; idx += blockDim.x * gridDim.x) { + VecT vec_data; + IndexT vec_idx = idx * VecSize; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + params.dst_index_helper.OffsetToIndex(vec_idx + i, dst_index[i]); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[i][perm[j]] = dst_index[i][j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index[i]); + vec_data[i] = src[src_offset]; + } + vec_dst[idx] = vec_data; + } + + // Singularized load data. + if (tid < tail_cnt) { + IndexT idx = tid + offset; + params.dst_index_helper.OffsetToIndex(idx, dst_index[0]); + +#pragma unroll + for (int j = 0; j < Rank; ++j) { + src_index[0][perm[j]] = dst_index[0][j]; + } + IndexT src_offset = params.src_index_helper.IndexToOffset(src_index[0]); + dst[idx] = src[src_offset]; + } +} + +// A Gerneral permute method that drectly find the dst data +// coordinate in the source data. +template +inline void LaunchPermuteKernel(const phi::GPUContext& ctx, const IndexT count, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, const T* src, + T* dst) { + size_t main_count = count / VecSize; + auto params = PermuteParams(dims, perm); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, main_count); + + if (perm_type == PermuteType::kNormalPermute) { + size_t tail_count = count - main_count * VecSize; + size_t offset = count - tail_count; + GeneralPermuteKernel< + T, IndexT, VecSize, + Rank><<>>( + params, src, dst, main_count, tail_count, offset); + } else { + VectorizedPermuteKernel< + T, IndexT, VecSize, + Rank><<>>( + params, main_count, src, dst); + } +} + +template +inline void LaunchPermuteRankDispatch(const phi::GPUContext& ctx, + const IndexT count, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, + const T* src, T* dst) { +#define CALL_DISPATCH_RANK(rank) \ + case rank: { \ + LaunchPermuteKernel(ctx, count, perm_type, dims, \ + perm, src, dst); \ + break; \ + } + + switch (dims.size()) { + CALL_DISPATCH_RANK(1); + CALL_DISPATCH_RANK(2); + CALL_DISPATCH_RANK(3); + CALL_DISPATCH_RANK(4); + CALL_DISPATCH_RANK(5); + CALL_DISPATCH_RANK(6); + CALL_DISPATCH_RANK(7); + CALL_DISPATCH_RANK(8); + CALL_DISPATCH_RANK(9); + } +#undef CALL_DISPATCH_RANK +} + +// Aim at transposing the last 2 dimensions. Refer from +// https://developer.nvidia.com/blog/efficient-matrix-transpose-cuda-cc/ +template +__global__ void BatchTransposeKernel(const T* __restrict__ src_data, + T* dst_data, IndexT rows, IndexT cols) { + using VecT = phi::AlignedVector; + + __shared__ VecT tile[kTileSize][kShareCol]; + T* single_tile = reinterpret_cast(tile); + + IndexT col_in_matrix = blockIdx.x * kTileSize + threadIdx.x; + IndexT offset = blockIdx.z * rows * cols; + + // Vectorized load data from src into shared memory. [rows, cols] + const VecT* __restrict__ src = + reinterpret_cast(src_data); + + for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { + IndexT row_in_matrix = tile_y + blockIdx.y * kTileSize; + + if (col_in_matrix < cols && row_in_matrix < rows) { + tile[tile_y][threadIdx.x] = + src[offset + row_in_matrix * cols + col_in_matrix]; + } + } + + // Singularized load data from shared memory into dst. 
+ // and dst_cols = rows, dst_rows = cols, [cols * Vecsize, rows] + col_in_matrix = blockIdx.y * kTileSize + threadIdx.x; + offset = offset * VecSize + col_in_matrix; + IndexT tile_x_idx = threadIdx.x * (kShareCol * VecSize); + + __syncthreads(); + + for (IndexT tile_y = threadIdx.y; tile_y < kTileSize; tile_y += kBlockRows) { + IndexT row_in_matrix = tile_y + blockIdx.x * kTileSize; + IndexT dst_idx = offset + row_in_matrix * VecSize * rows; + IndexT tile_idx = tile_x_idx + tile_y * VecSize; + if (col_in_matrix < /*dst_cols=*/rows && + row_in_matrix < /*dst_rows=*/cols) { +#pragma unroll + for (auto i = 0; i < VecSize; ++i) { + dst_data[dst_idx + i * rows] = single_tile[tile_idx + i]; + } + } + } +} + +// With the byte limitation of shared_memory, the VecSize shall be restricted +// for the type whose byte-size is less than 8. +template 8 ? 1 : Size)> +inline void LaunchTransposeKernel(const phi::GPUContext& ctx, + const std::vector& dims, const T* src, + T* dst) { + auto rank = dims.size(); + IndexT num_batches = (rank == 2) ? 1 : dims[0]; + IndexT rows = dims[rank - 2]; + IndexT cols = dims[rank - 1]; + IndexT num_tile_rows = (rows + kTileSize - 1) / kTileSize; + IndexT num_tile_cols = (cols + kTileSize - 1) / kTileSize; + + dim3 blocks(num_tile_cols, num_tile_rows, num_batches); + dim3 threads(kTileSize, kBlockRows, 1); + + BatchTransposeKernel<<>>( + src, dst, rows, cols); +} + +template +inline void LaunchWithDispatchVecSize(const phi::GPUContext& ctx, + const int vec_size, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, + const T* src, T* dst, IndexT count) { +#define CALL_DISPATCH_VEC_SIZE(vec_size) \ + case vec_size: { \ + if (perm_type == PermuteType::kTranspose) { \ + LaunchTransposeKernel(ctx, dims, src, dst); \ + } else { \ + LaunchPermuteRankDispatch(ctx, count, perm_type, \ + dims, perm, src, dst); \ + } \ + break; \ + } + + switch (vec_size) { + CALL_DISPATCH_VEC_SIZE(1); + CALL_DISPATCH_VEC_SIZE(2); + CALL_DISPATCH_VEC_SIZE(4); + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +#undef CALL_DISPATCH_VEC_SIZE +} + +template +inline void LaunchWithDispatchIndex(const phi::GPUContext& ctx, + const size_t count, const int vec_size, + const PermuteType perm_type, + const std::vector& dims, + const std::vector& perm, const T* src, + T* dst) { + if (count < std::numeric_limits::max()) { + LaunchWithDispatchVecSize(ctx, vec_size, perm_type, dims, perm, + src, dst, + static_cast(count)); + } else { + int64_t cnt = static_cast(count); + LaunchWithDispatchVecSize(ctx, vec_size, perm_type, dims, perm, + src, dst, + static_cast(count)); + } +} + +template +inline void SimplifyThenLaunch(const int rank, const DeviceContext& ctx, + const Tensor& in, Tensor* out, + const std::vector& perm) { + int sm_count = ctx.GetSMCount(); + auto src_dims = phi::vectorize(in.dims()); + auto simplifier = DimsSimplifier(sm_count, rank, perm, src_dims, + in.data(), out->data()); + + if (simplifier.GetPermType() == PermuteType::kCopy) { + // If perm is [0,1,2,3], then just operate a DtoD copy. 
+ phi::Copy(ctx, in, ctx.GetPlace(), false, out); + } else { + LaunchWithDispatchIndex( + ctx, simplifier.GetCount(), simplifier.GetVecSize(), + simplifier.GetPermType(), simplifier.GetDims(), simplifier.GetPerm(), + in.data(), out->data()); + } +} + +template +size_t GetTransposeKey(const int rank, const Tensor& in, + const std::vector& perm) { + auto in_shape = phi::vectorize(in.dims()); + return phi::autotune::GetKey( + in_shape, perm, rank, paddle::experimental::CppTypeToDataType::Type()); +} + template -void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int ndims, +void TransposeGPUKernelDriver(const phi::GPUContext& dev_ctx, const int rank, const Tensor& in, const std::vector& perm, Tensor* out) { + PADDLE_ENFORCE_LT( + rank, phi::DDim::kMaxRank, + platform::errors::OutOfRange( + "The maximum dimension rank of " + "tensor is expected to be less than %d, but here is %d.", + phi::DDim::kMaxRank, rank)); + auto ret = TransposeSimple::run(dev_ctx, in, perm, out); if (!ret) { - TransCompute(ndims, dev_ctx, in, out, perm); + auto* tuner = phi::autotune::MakeTransposeTuner( + SimplifyThenLaunch); + if (!tuner->IsInit()) { + tuner->AddCallBack( + phi::autotune::MakeCallback(TransCompute)); + tuner->Finalize(); + } + + auto key = GetTransposeKey(rank, in, perm); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetTranspose(); + if (cache.Find(key)) { + auto index = cache.Get(key); + tuner->RunBestKernel(index, rank, dev_ctx, in, out, perm); + } else { + // All avaliable kernels have ran while picking the best kernel, so + // there may be no need for another RunBestKernel. + auto index = tuner->PickBestKernel(dev_ctx, rank, dev_ctx, in, out, perm); + cache.Set(key, index); + } } } diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index 891aa312f69ff..ca57687ea5fe4 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -60,5 +61,182 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, } } +enum PermuteType { + kCopy = 1, + kTranspose = 2, + kVecPermute = 3, + kNormalPermute = 4 +}; + +constexpr int kBlockRows = 16; +constexpr int kTileSize = 32; +// To avoid bank conflict. +constexpr int kShareCol = kTileSize + 1; + +// Simplify the input dims and permute dims if possible. 
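The DimsSimplifier defined next collapses runs of dimensions that stay adjacent and in order under the permutation, so a high-rank transpose degenerates into a low-rank one before a kernel is chosen. A minimal Python sketch of that merging step is shown here for illustration only; the helper name is an assumption, it is not the patch's implementation, and the degenerate all-size-1 case that the C++ code handles explicitly is ignored:

    # Sketch of the dim/perm merging idea (illustrative, not the patch's code).
    def simplify(dims, perm):
        rank = len(dims)
        combined = list(dims)
        i = 0
        while i < rank:
            d = perm[i]                       # first dim of a consecutive run
            j = i + 1
            while j < rank and perm[j] == perm[j - 1] + 1:
                combined[d] *= dims[perm[j]]  # fold the run into its first dim
                combined[perm[j]] = 1         # mark the folded dim as useless
                j += 1
            i = j
        keep = [k for k, v in enumerate(combined) if v != 1]
        remap = {old: new for new, old in enumerate(keep)}
        new_dims = [combined[k] for k in keep]
        new_perm = [remap[p] for p in perm if p in remap]
        return new_dims, new_perm

    print(simplify([4, 8, 2, 5], [2, 3, 0, 1]))  # ([32, 10], [1, 0])

The printed result matches the worked example in the comments of the class below: dims [4, 8, 2, 5] with perm [2, 3, 0, 1] reduce to dims [32, 10] with perm [1, 0].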
+template +class DimsSimplifier { + public: + explicit DimsSimplifier(const int sm_count, const int rank, + const std::vector& perm, + const std::vector& dims, const T* src, T* dst) + : perm_(rank), dims_(rank) { + SimplifyPermAndDims(rank, dims, perm); + count_ = std::accumulate(dims.begin(), dims.end(), size_t{1}, + std::multiplies()); + if (rank_ > 1) { + vec_size_ = GetPermVecSize(sm_count, src, dst); + perm_.resize(rank_); + dims_.resize(rank_); + } + } + + size_t GetCount() const { return count_; } + int GetVecSize() const { return vec_size_; } + PermuteType GetPermType() const { return type_; } + + std::vector GetPerm() const { return perm_; } + std::vector GetDims() const { return dims_; } + + private: + size_t rank_{1}; + size_t count_{0}; + int vec_size_{1}; + std::vector perm_; + std::vector dims_; + PermuteType type_{kCopy}; + + void SimplifyPermAndDims(const size_t rank, + const std::vector& in_dims, + const std::vector& perm) { + size_t combined_dims[phi::DDim::kMaxRank]; + int valid_map[phi::DDim::kMaxRank]; + + // Merge consecutive dims to the fist one of this these dims, + // and leave the origin dim value to be 1. Example below : + // perm: [2, 3, 0, 1], origin_dims : [4, 8, 2, 5] + // new_dims: [4, 8, 2, 5] -> [32, 1, 10, 1] + size_t start_perm_idx = 0; + while (start_perm_idx < rank) { + const size_t start_dim_idx = perm[start_perm_idx]; + combined_dims[start_dim_idx] = in_dims[start_dim_idx]; + size_t end_perm_idx = start_perm_idx + 1; + + while (end_perm_idx < rank && + perm[end_perm_idx] == perm[end_perm_idx - 1] + 1) { + const size_t end_dim_idx = perm[end_perm_idx]; + combined_dims[start_dim_idx] *= in_dims[end_dim_idx]; + combined_dims[end_dim_idx] = 1; + end_perm_idx += 1; + } + start_perm_idx = end_perm_idx; + } + + // Reorder combined dims and marked useless dim as -1. + // for example, if combined dims is [32, 1, 10, 1], + // valid_map is [0, -1, 1, -1] and generate simplified + // dims as [32, 10] + size_t valid_dim_idx = 0; + bool sequential_flag = false; + for (size_t i = 0; i < rank; ++i) { + const int src_dim = combined_dims[i]; + if (src_dim == 1) { + valid_map[i] = -1; + } else { + sequential_flag = true; + valid_map[i] = valid_dim_idx; + dims_[valid_dim_idx] = src_dim; + valid_dim_idx += 1; + } + } + + if (valid_dim_idx == 0) { + dims_[0] = 1; + perm_[0] = 0; + return; + } else if (valid_dim_idx == 1) { + type_ = PermuteType::kCopy; + } + + // Acquire simplified perm with help of combined dims + // and original perm, finally simplified perm is [1, 0] + size_t perm_idx = 0; + for (size_t i = 0; i < rank; ++i) { + const int mapped = valid_map[perm[i]]; + if (mapped >= 0) { + perm_[perm_idx] = mapped; + perm_idx += 1; + } + } + rank_ = valid_dim_idx; + } + + int GetPermVecSize(const int sm_count, const T* src, T* dst) { + // For gerneal_permute kernel, there is good chance for + // vectorized write. + int vec_size = phi::GetVectorizedSize(dst); + type_ = PermuteType::kNormalPermute; + + // While the last dim is fixed, there is good chance for + // both vectorized read and write. + if (perm_[rank_ - 1] == rank_ - 1) { + int tmp_size = std::min(vec_size, phi::GetVectorizedSize(src)); + tmp_size = GetDimVesSize(tmp_size, dims_[rank_ - 1]); + if (tmp_size > 1) { + type_ = kVecPermute; + vec_size = tmp_size; + + // For stride calculation of src_data index. + dims_[rank_ - 1] /= vec_size; + } + } + + // Once only transpose at the last 2 dims, there is good + // chance for vectorized read. 
+ if ((rank_ == 2 && perm_[1] == 0 && perm_[0] == 1) || + (rank_ == 3 && perm_[2] == 1 && perm_[1] == 2)) { + type_ = PermuteType::kTranspose; + + // Compared with vectorized load or read, set config to let more + // sm work simultaneously affect more according to performance. + constexpr int threads = kTileSize * kTileSize; + int blocks = count_ / threads; + if (blocks < sm_count) { + vec_size = 1; + } else { + int tmp_vec = std::min(vec_size, phi::GetVectorizedSize(src)); + // With bytes limitation of shared_memory, the VecSize shall be + // restricted for the type whose byte-size is less than 8 (double). + int type_vec = + sizeof(T) > 8 ? 1 : GetDimVesSize(tmp_vec, dims_[rank_ - 1]); + for (int i = type_vec; i > 0; i /= 2) { + if (blocks / i >= sm_count) { + break; + } + // When blocks is smaller than sm_count, a test shown that decrease + // vec_size to make blocks close to sm_count would gain performance. + vec_size = i; + } + } + + dims_[rank_ - 1] /= vec_size; + count_ /= vec_size; + } + return vec_size; + } + + // To find if highest common divisor and make it as vec_size. + int GetDimVesSize(const int vec_size, const size_t target_dim) { + int dim_vec_size = 1; + for (auto size = vec_size; size > 0; size /= 2) { + if (target_dim % size == 0) { + dim_vec_size = size; + break; + } + } + return dim_vec_size; + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index f2a150c301216..892c5b29aae33 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -59,8 +59,8 @@ struct FastDivMod { return result; } - int32_t divisor; int32_t shift_val; + uint32_t divisor; uint32_t multiplier; }; diff --git a/paddle/phi/kernels/autotune/auto_tune_base.h b/paddle/phi/kernels/autotune/auto_tune_base.h index e18b854cf34b3..95afa7f697b49 100644 --- a/paddle/phi/kernels/autotune/auto_tune_base.h +++ b/paddle/phi/kernels/autotune/auto_tune_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "glog/logging.h" @@ -23,7 +24,7 @@ namespace phi { namespace autotune { -template +template class KernelCallback { public: using ReturnT = RetureType; @@ -33,71 +34,126 @@ class KernelCallback { explicit KernelCallback(FuncType func_) : func(func_) {} virtual ~KernelCallback() {} - RetureType Call(Args... args) { return func(args...); } + RetureType Run(Args... args) { return func(args...); } private: FuncType func; }; -template -static KernelCallback MakeCallback( +template +static KernelCallback MakeCallback( RetureType (*cb)(Args...)) { - return KernelCallback(cb); + return KernelCallback(cb); } -template +template class AutoTuneBase { public: AutoTuneBase() {} virtual ~AutoTuneBase() {} - explicit AutoTuneBase(KernelType kernel) : default_kernel_(kernel) { + explicit AutoTuneBase(KernelType kernel) { kernels_.push_back(kernel); } + + template + void AddCallBack(Type kernel) { + static_assert(std::is_same::value, + "Type must be the same"); kernels_.push_back(kernel); } - template - void AddCallBack(T kernel) { - static_assert(std::is_same::value, "Type must be the same"); - kernels_.push_back(kernel); + template + void RunBestKernel(const int idx, Args&&... args) { + kernels_[idx].Run(args...); + } + + template + void RunDefaultKernel(Args&&... args) { + kernels_[0].Run(args...); } template - KernelType PickBestKernel(const Context& ctx, Args&&... args) { + int PickBestKernel(const Context& ctx, Args&&... 
args) { PADDLE_ENFORCE_GT( kernels_.size(), 0, paddle::platform::errors::InvalidArgument( "kernel num must be greater than 0, now is %d", kernels_.size())); - int idx = 0; - phi::GpuTimer timer; + int best_idx = 0; float min_time = std::numeric_limits::max(); + // Time cost test estabulished in default stream. for (int i = 0; i < kernels_.size(); ++i) { - ctx.Wait(); - timer.Start(0); - kernels_[i].Call(args...); - timer.Stop(0); - auto time = timer.ElapsedTime(); - VLOG(3) << "kernel[" << i << "]: time cost is " << time; - + auto time = RunAndMeasureKernel(ctx, i, args...); if (time < min_time) { min_time = time; - idx = i; + best_idx = i; } } - VLOG(3) << "best kernel idx is " << idx; - return kernels_[idx]; + VLOG(3) << "best kernel idx is " << best_idx; + return best_idx; } + bool IsInit() { return is_init_; } + void Finalize() { is_init_ = true; } + private: - KernelType default_kernel_; + bool is_init_{false}; std::vector kernels_; + + template + float RunAndMeasureKernel(const Context& ctx, const int idx, Args&&... args) { + phi::GpuTimer timer; + float time_cost = 0; + const auto& stream = ctx.stream(); + + // Treat 1st run as warm up. Judge the result with + // the sum of 2nd and 3rd run. + constexpr int repeats = 3; + + ctx.Wait(); + for (int i = 0; i < repeats; ++i) { + timer.Start(stream); + kernels_[idx].Run(args...); + timer.Stop(stream); + auto time = timer.ElapsedTime(); + if (i > 0) { + time_cost += time; + } + VLOG(3) << "kernel[" << idx << "][" << i << "th time cost is " << time; + } + return time_cost; + } }; -template -static AutoTuneBase> MakeAutoTuner( +template +static AutoTuneBase> MakeAutoTuner( RetureType (*func)(Args...)) { - auto obj = MakeCallback(func); - return AutoTuneBase(obj); + auto obj = MakeCallback(func); + return AutoTuneBase(obj); +} + +template +class TransposeAutoTuner : public AutoTuneBase { + public: + static AutoTuneBase* Instance(KernelType kernel) { + static std::unique_ptr> instance_; + std::call_once(init_flag_, [&] { + instance_.reset(new AutoTuneBase(kernel)); + }); + return instance_.get(); + } + + private: + static std::once_flag init_flag_; +}; + +template +std::once_flag TransposeAutoTuner::init_flag_; + +template +static AutoTuneBase>* + MakeTransposeTuner(RetureType (*func)(Args...)) { + auto obj = MakeCallback(func); + return TransposeAutoTuner::Instance(obj); } } // namespace autotune diff --git a/paddle/phi/kernels/autotune/auto_tune_test.cu b/paddle/phi/kernels/autotune/auto_tune_test.cu index c3918b8ebe59d..8701a0572fcd8 100644 --- a/paddle/phi/kernels/autotune/auto_tune_test.cu +++ b/paddle/phi/kernels/autotune/auto_tune_test.cu @@ -74,7 +74,7 @@ float Algo(const phi::GPUContext& ctx, } TEST(AutoTune, sum) { - int64_t N = 1 << 22; + int64_t N = 1 << 20; size_t blocks = 512; size_t threads = 256; size_t size = sizeof(float) * N; @@ -119,35 +119,35 @@ TEST(AutoTune, sum) { // 1. Test call_back. 
VLOG(3) << ">>> [CallBack]: Test case."; - auto callback1 = tune::MakeCallback(Algo<4>); - auto callback2 = tune::MakeCallback(Algo<2>); - auto callback3 = tune::MakeCallback(Algo<1>); + auto callback1 = tune::MakeCallback(Algo<4>); + auto callback2 = tune::MakeCallback(Algo<2>); + auto callback3 = tune::MakeCallback(Algo<1>); std::vector callbacks{callback1, callback2, callback3}; for (int i = 0; i < callbacks.size(); ++i) { dev_ctx->Wait(); phi::GpuTimer timer; timer.Start(0); - callbacks[i].Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); + callbacks[i].Run(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); timer.Stop(0); VLOG(3) << "kernel[" << i << "]: time cost is " << timer.ElapsedTime(); } // 2. Test call_back tune. VLOG(3) << ">>> [AutoTune]: Test case."; - auto tuner = tune::MakeAutoTuner(Algo<4>); - tuner.AddCallBack(tune::MakeCallback(Algo<2>)); - tuner.AddCallBack(tune::MakeCallback(Algo<1>)); + auto tuner = tune::MakeAutoTuner(Algo<4>); + tuner.AddCallBack(tune::MakeCallback(Algo<2>)); + tuner.AddCallBack(tune::MakeCallback(Algo<1>)); /* The 1st ctx works for ctx.Wait(), the 2nd is just the param of call_back. */ - auto best_call_back = tuner.PickBestKernel( + auto best_index = tuner.PickBestKernel( *dev_ctx, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); - best_call_back.Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); dev_ctx->Wait(); phi::GpuTimer timer; timer.Start(0); - best_call_back.Call(*dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); + tuner.RunBestKernel( + best_index, *dev_ctx, *d_in1.get(), d_in2.get(), N, threads, blocks); timer.Stop(0); VLOG(3) << "Best CallBackKernel time cost is " << timer.ElapsedTime(); #endif diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index 9d7f57e96e373..8de0695ede40c 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -134,7 +134,8 @@ enum class AlgorithmType { kConvForward = 1, kConvBackwardData = 2, kConvBackwardFilter = 3, - kAlgorithmCount = 4 + kTranspose = 4, + kAlgorithmCount = 5 }; // AlgorithmsConfigKey -> AlgorithmsID @@ -165,6 +166,8 @@ class AutoTuneCache { return Get(AlgorithmType::kConvBackwardFilter); } + AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } + void Clean() { for (auto& v : auto_tune_map_) { v.second.Clean(); From 9bb39d489972ad85eec43d6418619e9e5a2a22f0 Mon Sep 17 00:00:00 2001 From: Jiabin Yang <360788950@qq.com> Date: Tue, 7 Jun 2022 12:00:17 +0800 Subject: [PATCH 16/53] support prune (#43250) --- .../auto_code_generator/eager_generator.cc | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 817a0de6e0ca9..73baf21015833 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1206,22 +1206,37 @@ static std::string GenerateGradNodeCreationContent( if (!input.duplicable()) { compute_require_grad_args += ", " + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); - - const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, - LegalizeVarName(input_name), input_position); - + bool found_target_name = false; + for (const auto& iter : op_base_infos) { + const auto& 
grad_outs_slot_map = iter.GetGradOutsSlotnameMap(); + for (auto iter : grad_outs_slot_map) { + if ((!found_target_name) && (input_name == iter.second)) { + const char* SET_GRAD_OUT_META_TEMPLATE = + " grad_node->SetGradOutMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_OUT_META_TEMPLATE, LegalizeVarName(input_name), + input_position); + found_target_name = true; + } + } + } } else { compute_require_grad_args += ", &" + input_autograd_name; size_t input_position = fwd_inputs_name_pos_map.at(input_name); - - const char* SET_GRAD_OUT_META_TEMPLATE = - " grad_node->SetGradOutMeta(%s, %d);\n"; - grad_node_creation_str += - paddle::string::Sprintf(SET_GRAD_OUT_META_TEMPLATE, - LegalizeVarName(input_name), input_position); + bool found_target_name = false; + for (const auto& iter : op_base_infos) { + const auto& grad_outs_slot_map = iter.GetGradOutsSlotnameMap(); + for (auto iter : grad_outs_slot_map) { + if ((!found_target_name) && (input_name == iter.second)) { + const char* SET_GRAD_OUT_META_TEMPLATE = + " grad_node->SetGradOutMeta(%s, %d);\n"; + grad_node_creation_str += paddle::string::Sprintf( + SET_GRAD_OUT_META_TEMPLATE, LegalizeVarName(input_name), + input_position); + found_target_name = true; + } + } + } } } From 8c3777dfcba3e4d20045087db2957ea34f076aec Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 7 Jun 2022 12:46:06 +0800 Subject: [PATCH 17/53] [multi-stream] Fix split and concat problem. (#43039) --- .../fluid/inference/api/analysis_predictor.cc | 6 --- .../inference/tests/infer_ut/CMakeLists.txt | 2 + .../inference/tests/infer_ut/test_LeViT.cc | 2 +- paddle/fluid/memory/memcpy.cc | 2 +- paddle/fluid/platform/device_context.cc | 4 +- paddle/phi/backends/gpu/gpu_context.cc | 3 +- .../kernels/funcs/concat_and_split_functor.cu | 39 ++++++++++--------- 7 files changed, 29 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5f9051ff2fdb9..18229c302db39 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1090,12 +1090,6 @@ CreatePaddlePredictor( process_level_allocator_enabled = true; } - // TODO(Jingzhuangzhuang): Fix trt error when allocator_strategy is - // auto_growth - if (config.tensorrt_engine_enabled()) { - gflags.push_back("--allocator_strategy=naive_best_fit"); - } - if (framework::InitGflags(gflags)) { VLOG(3) << "The following gpu analysis configurations only take effect " "for the first predictor: "; diff --git a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt index 5aef30bf335c3..0aee989367e4b 100644 --- a/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt +++ b/paddle/fluid/inference/tests/infer_ut/CMakeLists.txt @@ -87,9 +87,11 @@ endif() if(WITH_GPU) if(NOT WIN32) + add_definitions("-DPADDLE_WITH_GPU") set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") + include_directories("${CUDA_LIB}/../include") else() set(CUDA_LIB "" diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index b74d1189b804b..b069feaec1ae7 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -157,7 +157,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { for (int i = 0; i < thread_num; ++i) { threads.emplace_back(paddle::test::SingleThreadPrediction, pred_pool.Retrive(i), 
&my_input_data_map, - &infer_output_data, 2); + &infer_output_data, 10); } // thread join & check outputs diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 3198b4f8d935e..c45180f600e3e 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -648,7 +648,7 @@ void Copy( platform::SetDeviceId(dst_place.device); VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " - << dst_place << " by thream(" << stream << ")"; + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event( "GpuMemcpyAsync:CPU->GPU", platform::TracerEventType::UserDefined, 1); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 0bd606257f541..fd61b813f0aa2 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -54,7 +54,9 @@ AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { auto& desired_dev_ctx = static_cast(dev_ctx); if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { - return Alloc(place, size); + return paddle::memory::Alloc(desired_dev_ctx.GetPlace(), size, + phi::Stream(reinterpret_cast( + desired_dev_ctx.stream()))); } else { return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( desired_dev_ctx, size); diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f51f287ee4a08..f68e451039092 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -504,8 +504,7 @@ struct GPUContext::Impl { void AddStreamCallback(const std::function& callback) const { // NOTE(zhiqiu): better use threadpool here, otherwise "std::async" may - // launch too - // many threads and result in thread oversubscription. + // launch too many threads and result in thread oversubscription. auto* callback_func = new std::function(std::move(callback)); auto* func = new std::function([this, callback_func] { std::lock_guard lock(stream_call_back_mtx_); diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 5abaf6c2ffa87..1c9fbffa2ac19 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -276,10 +276,7 @@ struct ConcatFunctor { int64_t out_row = in_row, out_col = 0; int inputs_col_num = in_num + 1; - std::vector inputs_data_vec(in_num); - std::vector inputs_col_vec(inputs_col_num); - const T** inputs_data = inputs_data_vec.data(); - int64_t* inputs_col = inputs_col_vec.data(); + paddle::memory::AllocationPtr data_alloc, col_alloc; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -289,16 +286,22 @@ struct ConcatFunctor { // 3.2.6.1. 
Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, col_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), in_num * sizeof(T*)); - inputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function col_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), inputs_col_num * sizeof(int)); - inputs_col = reinterpret_cast(col_alloc->ptr()); +#else + // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu + // allocator. + data_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + in_num * sizeof(T*)); + col_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + (inputs_col_num) * sizeof(int64_t)); #endif + const T** inputs_data = reinterpret_cast(data_alloc->ptr()); + int64_t* inputs_col = reinterpret_cast(col_alloc->ptr()); inputs_col[0] = 0; bool has_same_shape = true; @@ -387,7 +390,6 @@ struct ConcatFunctor { output->data()); } -#ifdef PADDLE_WITH_HIP // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -401,7 +403,6 @@ struct ConcatFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( col_alloc_released); }); -#endif } }; @@ -432,10 +433,7 @@ class SplitFunctor { bool has_same_shape = true; int outputs_cols_num = o_num + 1; - std::vector outputs_data_vec(o_num); - std::vector outputs_cols_vec(outputs_cols_num); - T** outputs_data = outputs_data_vec.data(); - int64_t* outputs_cols = outputs_cols_vec.data(); + paddle::memory::AllocationPtr data_alloc, cols_alloc; // There are some differences between hip runtime and NV runtime. // In NV, when the pageable memory data less than 64K is transferred from @@ -445,16 +443,22 @@ class SplitFunctor { // 3.2.6.1. Concurrent Execution between Host and Device // Memory copies from host to device of a memory block of 64 KB or less #ifdef PADDLE_WITH_HIP - paddle::memory::AllocationPtr data_alloc, cols_alloc; // TODO(chentianyu03): try to find a method to remove the Alloc function data_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), o_num * sizeof(T*)); - outputs_data = reinterpret_cast(data_alloc->ptr()); // TODO(chentianyu03): try to find a method to remove the Alloc function cols_alloc = paddle::memory::Alloc(paddle::platform::CUDAPinnedPlace(), (outputs_cols_num) * sizeof(int64_t)); - outputs_cols = reinterpret_cast(cols_alloc->ptr()); +#else + // TODO(pinned): cuda-graph not support pinned memory, we just use the cpu + // allocator. 
+ data_alloc = + paddle::memory::Alloc(paddle::platform::CPUPlace(), o_num * sizeof(T*)); + cols_alloc = paddle::memory::Alloc(paddle::platform::CPUPlace(), + (outputs_cols_num) * sizeof(int64_t)); #endif + T** outputs_data = reinterpret_cast(data_alloc->ptr()); + int64_t* outputs_cols = reinterpret_cast(cols_alloc->ptr()); outputs_cols[0] = 0; for (int i = 0; i < o_num; ++i) { @@ -547,7 +551,7 @@ class SplitFunctor { static_cast(outputs_cols_num), dev_out_gpu_data); } -#ifdef PADDLE_WITH_HIP + // Prevent the pinned memory value from being covered and release the memory // after the launch kernel of the stream is executed (reapply pinned memory // next time) @@ -559,7 +563,6 @@ class SplitFunctor { paddle::memory::allocation::Allocator::AllocationDeleter( cols_alloc_released); }); -#endif } }; From d95293f3de7113eb58f427ed7990f759f797909f Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 7 Jun 2022 14:05:39 +0800 Subject: [PATCH 18/53] [Eager] fix 2 fused op test and add retain_grad flag under eager (#43258) --- ...sed_bias_dropout_residual_layer_norm_op.py | 4 +- .../unittests/test_fused_gate_attention_op.py | 4 +- .../test_tensor_fill_diagonal_tensor.py | 54 ++++++++++--------- 3 files changed, 36 insertions(+), 26 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py index 92c815a246f73..f31cc78986e56 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_bias_dropout_residual_layer_norm_op.py @@ -26,7 +26,9 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph + +_enable_legacy_dygraph() default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index 2d6243955478c..edfb46f5813b6 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -21,9 +21,11 @@ from op_test import OpTest, convert_float_to_uint16 from test_sparse_attention_op import get_cuda_version from paddle import _C_ops -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph from paddle.fluid import core +_enable_legacy_dygraph() + @unittest.skipIf(not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA") diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py index 4765b540c7e60..e71cc3b7239f1 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_tensor.py @@ -30,10 +30,10 @@ def setUp(self): def test_dim2(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[1, 2, 2], [2, 1, 2], [2, 2, 1], [2, 2, 2]]).astype('float32') - expected_grad = np.array( - [[0, 1, 1], [1, 0, 1], [1, 1, 0], [1, 1, 1]]).astype('float32') + expected_np = np.array([[1, 2, 2], [2, 1, 2], [2, 2, 1], + [2, 2, 2]]).astype('float32') + expected_grad = np.array([[0, 1, 1], [1, 0, 1], [1, 1, 0], + 
[1, 1, 1]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -59,10 +59,10 @@ def test_dim2(self): def test_dim2_offset_1(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[2, 2, 2], [1, 2, 2], [2, 1, 2], [2, 2, 1]]).astype('float32') - expected_grad = np.array( - [[1, 1, 1], [0, 1, 1], [1, 0, 1], [1, 1, 0]]).astype('float32') + expected_np = np.array([[2, 2, 2], [1, 2, 2], [2, 1, 2], + [2, 2, 1]]).astype('float32') + expected_grad = np.array([[1, 1, 1], [0, 1, 1], [1, 0, 1], + [1, 1, 0]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -88,10 +88,10 @@ def test_dim2_offset_1(self): def test_dim2_offset1(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[2, 1, 2], [2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32') - expected_grad = np.array( - [[1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32') + expected_np = np.array([[2, 1, 2], [2, 2, 1], [2, 2, 2], + [2, 2, 2]]).astype('float32') + expected_grad = np.array([[1, 0, 1], [1, 1, 0], [1, 1, 1], + [1, 1, 1]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -117,18 +117,22 @@ def test_dim2_offset1(self): def test_dim4(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) - expected_np = np.array( - [[[[0, 3], [2, 2], [2, 2]], [[2, 2], [1, 4], [2, 2]], - [[2, 2], [2, 2], [2, 5]], [[2, 2], [2, 2], [2, 2]]], - [[[6, 9], [2, 2], [2, 2]], [[2, 2], [7, 10], [2, 2]], - [[2, 2], [2, 2], [8, 11]], - [[2, 2], [2, 2], [2, 2]]]]).astype('float32') - expected_grad = np.array( - [[[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]], - [[1, 1], [1, 1], [0, 0]], [[1, 1], [1, 1], [1, 1]]], - [[[0, 0], [1, 1], [1, 1]], [[1, 1], [0, 0], [1, 1]], - [[1, 1], [1, 1], [0, 0]], - [[1, 1], [1, 1], [1, 1]]]]).astype('float32') + expected_np = np.array([[[[0, 3], [2, 2], [2, 2]], + [[2, 2], [1, 4], [2, 2]], + [[2, 2], [2, 2], [2, 5]], + [[2, 2], [2, 2], [2, 2]]], + [[[6, 9], [2, 2], [2, 2]], + [[2, 2], [7, 10], [2, 2]], + [[2, 2], [2, 2], [8, 11]], + [[2, 2], [2, 2], [2, 2]]]]).astype('float32') + expected_grad = np.array([[[[0, 0], [1, 1], [1, 1]], + [[1, 1], [0, 0], [1, 1]], + [[1, 1], [1, 1], [0, 0]], + [[1, 1], [1, 1], [1, 1]]], + [[[0, 0], [1, 1], [1, 1]], + [[1, 1], [0, 0], [1, 1]], + [[1, 1], [1, 1], [0, 0]], + [[1, 1], [1, 1], [1, 1]]]]).astype('float32') for idx, p in enumerate(self.places): if idx == 0: @@ -154,6 +158,7 @@ def test_dim4(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_largedim(self): + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) if len(self.places) > 1: bsdim = 1024 fsdim = 128 @@ -175,6 +180,7 @@ def test_largedim(self): self.assertEqual((ny == expected_pred).all(), True) self.assertEqual((y.grad == expected_grad).all(), True) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) if __name__ == '__main__': From eac125f9124a3bc04a89acec4fc01ec8a3de9677 Mon Sep 17 00:00:00 2001 From: BrilliantYuKaimin <91609464+BrilliantYuKaimin@users.noreply.github.com> Date: Tue, 7 Jun 2022 14:09:24 +0800 Subject: [PATCH 19/53] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=20paddle.assign=20?= =?UTF-8?q?=E7=AD=89=20API=20=E7=9A=84=E6=96=87=E6=A1=A3=20=20(#42942)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Update creation.py * Update search.py * Update search.py * Update xavier.py * Update xavier.py * Update pooling.py * Update pooling.py * Update pooling.py * Update search.py --- 
python/paddle/nn/functional/pooling.py | 22 +++++------ python/paddle/nn/initializer/xavier.py | 34 ++++++++--------- python/paddle/nn/layer/pooling.py | 36 +++++++---------- python/paddle/tensor/creation.py | 15 ++++---- python/paddle/tensor/search.py | 53 ++++++++++++-------------- 5 files changed, 70 insertions(+), 90 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index f79a43fbc03a6..4bb53e1737bf8 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1273,24 +1273,20 @@ def max_pool3d(x, def adaptive_avg_pool1d(x, output_size, name=None): """ - This API implements adaptive average pooling 1d operation. - See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . + Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. + + Notes: + See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . Args: - x (Tensor): The input tensor of pooling operator, which is a 3-D tensor - with shape [N, C, L]. The format of input tensor is NCL, - where N is batch size, C is the number of channels, L is the - length of the feature. The data type is float32 or float64. - output_size (int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64. + output_size (int): The target output size. Its data type must be int. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - Tensor: The output tensor of adaptive average pooling result. The data type is same - as input tensor. + Tensor: The result of 1D adaptive average pooling. Its data type is same as input. Examples: .. code-block:: python - :name: code-example1 + :name: adaptive_avg_pool1d-example # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index e11790df7dfbc..d6570f9db2fe5 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -22,28 +22,26 @@ class XavierNormal(XavierInitializer): This class implements the Xavier weight initializer from the paper `Understanding the difficulty of training deep feedforward neural networks `_ - by Xavier Glorot and Yoshua Bengio, using a normal distribution. - - The mean is 0 and the standard deviation is + by Xavier Glorot and Yoshua Bengio, using a normal distribution whose mean is :math:`0` and standard deviation is .. math:: - \sqrt{\frac{2.0}{fan\_in + fan\_out}} + \sqrt{\frac{2.0}{fan\_in + fan\_out}}. Args: - fan_in (float, optional): fan_in for Xavier initialization, It is - inferred from the tensor. The default value is None. - fan_out (float, optional): fan_out for Xavier initialization, it is - inferred from the tensor. The default value is None. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + fan_in (float, optional): fan_in for Xavier initialization, which is + inferred from the Tensor. The default value is None. 
+ fan_out (float, optional): fan_out for Xavier initialization, which is + inferred from the Tensor. The default value is None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A parameter initialized by Xavier weight, using a normal distribution. Examples: .. code-block:: python + :name: initializer_XavierNormal-example import paddle @@ -81,25 +79,25 @@ class XavierUniform(XavierInitializer): This initializer is designed to keep the scale of the gradients approximately same in all the layers. In case of Uniform distribution, - the range is [-x, x], where + the range is :math:`[-x,x]`, where .. math:: - x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}}. Args: - fan_in (float, optional): fan_in for Xavier initialization, it is - inferred from the tensor. The default value is None. - fan_out (float, optional): fan_out for Xavier initialization, it is - inferred from the tensor. The default value is None. - name(str, optional): The default value is None. Normally there is no need for user to set this - property. For more information, please refer to :ref:`api_guide_Name`. + fan_in (float, optional): fan_in for Xavier initialization, which is + inferred from the Tensor. The default value is None. + fan_out (float, optional): fan_out for Xavier initialization, which is + inferred from the Tensor. The default value is None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A parameter initialized by Xavier weight, using a uniform distribution. Examples: .. code-block:: python + :name: initializer_XavierUniform-example import paddle diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 990d0b6107864..e7b6fc24afad8 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -619,42 +619,32 @@ def extra_repr(self): class AdaptiveAvgPool1D(Layer): r""" - This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_mask parameters. - Input(X) and output(Out) are in NCL format, where N is batch - size, C is the number of channels, L is the length of the feature. - The output tensor shape will be [N, C, output_size]. + A 1D adaptive average pooling over an input signal composed + of several input planes, based on :attr:`output_size`. + Input and output are in NCL format, where N is batch + size, C is the number of channels and L is the length of the feature. + The shape of output will be :math:`[N, C, output\_size]`. - For average adaptive pool1d: + The formulation for average adaptive pool1d is .. math:: - lstart &= floor(i * L_{in} / L_{out}) + lstart &= \lfloor i * L_{in} / L_{out}\rfloor, - lend &= ceil((i + 1) * L_{in} / L_{out}) + lend &= \lceil(i + 1) * L_{in} / L_{out}\rceil, - Output(i) &= \frac{ \sum Input[lstart:lend]}{lend - lstart} + Output(i) &= \frac{\sum Input[lstart:lend]}{lend - lstart}. Parameters: - output_size(int): The target output size. It must be an integer. - name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. - Usually name is no need to set and None by default. + output_size(int): The target output size. Its data type must be int. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. 
Returns: - A callable object of AdaptiveAvgPool1D. - - Raises: - ValueError: 'output_size' should be an integer. - - Shape: - - x(Tensor): 3-D tensor. The input tensor of adaptive avg pool1d operator, which is a 3-D tensor. - The data type can be float32, float64. - - output(Tensor): 3-D tensor. The output tensor of adaptive avg pool1d operator, which is a 3-D tensor. - The data type is same as input x. + A callable object for computing 1D adaptive average pooling. Examples: .. code-block:: python - + :name: AdaptiveAvgPool1D-example # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], # output shape is [N, C, m], adaptive pool divide L dimension diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 67547212bb196..521839af902b5 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -1479,22 +1479,21 @@ def empty_like(x, dtype=None, name=None): def assign(x, output=None): """ - The OP copies the :attr:`x` to the :attr:`output`. + Copy value of the :attr:`x` to the :attr:`output`. Parameters: - x (Tensor|np.ndarray|list|tuple|scalar): A tensor, numpy ndarray, tuple/list of scalar, - or scalar. Its data type supports float16, float32, float64, int32, int64, and bool. - Note: the float64 data will be converted to float32 because of current platform protobuf + x (Tensor|np.ndarray|list|tuple|scalar): A Tensor, numpy ndarray, tuple/list of scalar, + or scalar. Its data type can be float16, float32, float64, int32, int64 or bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. - output (Tensor, optional): A tensor. If :attr:`output` is None, a new tensor will - be created as :attr:`output`. Default: None. + output (Tensor, optional): A Tensor. If :attr:`output` is None, a new Tensor will be created as :attr:`output`. Default: None. Returns: - Tensor: A tensor with the same shape, data type and value as :attr:`x`. + Tensor: A Tensor with the same shape, data type and value as :attr:`x`. Examples: .. code-block:: python - + :name: assign-example + import paddle import numpy as np data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 42087ac7dafa3..94a05294aaa63 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -572,49 +572,46 @@ def mode(x, axis=-1, keepdim=False, name=None): def where(condition, x=None, y=None, name=None): r""" - Return a tensor of elements selected from either $x$ or $y$, depending on $condition$. - - **Note**: - ``paddle.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``. + Return a Tensor of elements selected from either :attr:`x` or :attr:`y` according to corresponding elements of :attr:`condition`. Concretely, .. math:: - out_i = - \begin{cases} - x_i, \quad \text{if} \ condition_i \ is \ True \\ - y_i, \quad \text{if} \ condition_i \ is \ False \\ - \end{cases} + out_i = + \begin{cases} + x_i, & \text{if} \ condition_i \ \text{is} \ True \\ + y_i, & \text{if} \ condition_i \ \text{is} \ False \\ + \end{cases}. + Notes: + ``numpy.where(condition)`` is identical to ``paddle.nonzero(condition, as_tuple=True)``, please refer to :ref:`api_tensor_search_nonzero`. Args: - condition(Tensor): The condition to choose x or y. When True(nonzero), yield x, otherwise yield y. 
- x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name`. + condition (Tensor): The condition to choose x or y. When True (nonzero), yield x, otherwise yield y. + x (Tensor|scalar, optional): A Tensor or scalar to choose when the condition is True with data type of float32, float64, int32 or int64. Either both or neither of x and y should be given. + y (Tensor|scalar, optional): A Tensor or scalar to choose when the condition is False with data type of float32, float64, int32 or int64. Either both or neither of x and y should be given. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: - Tensor: A Tensor with the same data dype as x. + Tensor: A Tensor with the same shape as :attr:`condition` and same data type as :attr:`x` and :attr:`y`. Examples: .. code-block:: python + :name:where-example - import paddle + import paddle - x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) - y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) - out = paddle.where(x>1, x, y) + x = paddle.to_tensor([0.9383, 0.1983, 3.2, 1.2]) + y = paddle.to_tensor([1.0, 1.0, 1.0, 1.0]) + out = paddle.where(x>1, x, y) - print(out) - #out: [1.0, 1.0, 3.2, 1.2] + print(out) + #out: [1.0, 1.0, 3.2, 1.2] - out = paddle.where(x>1) - print(out) - #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [[2], - # [3]]),) + out = paddle.where(x>1) + print(out) + #out: (Tensor(shape=[2, 1], dtype=int64, place=CPUPlace, stop_gradient=True, + # [[2], + # [3]]),) """ if np.isscalar(x): x = paddle.full([1], x, np.array([x]).dtype.name) From 0fdb3ced4574487c3fbed7f325aa7b89f71af28b Mon Sep 17 00:00:00 2001 From: Guoxia Wang Date: Tue, 7 Jun 2022 14:23:18 +0800 Subject: [PATCH 20/53] add bf16 dtype for flatten kernel (#43264) --- paddle/phi/kernels/flatten_grad_kernel.cc | 2 ++ paddle/phi/kernels/flatten_kernel.cc | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 54279fca6e429..73d963f606e3f 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -38,6 +38,7 @@ PD_REGISTER_KERNEL(flatten_grad, CPU, ALL_LAYOUT, phi::FlattenGradKernel, + phi::dtype::bfloat16, float, double, uint8_t, @@ -52,6 +53,7 @@ PD_REGISTER_KERNEL(flatten_grad, phi::FlattenGradKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index dd000896073c7..006d3438288c1 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -54,6 +54,7 @@ PD_REGISTER_KERNEL(flatten, ALL_LAYOUT, phi::FlattenKernel, float, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -66,6 +67,7 @@ PD_REGISTER_KERNEL(flatten_with_xshape, ALL_LAYOUT, phi::FlattenWithXShape, float, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(flatten, phi::FlattenKernel, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, @@ -93,6 +96,7 @@ 
PD_REGISTER_KERNEL(flatten_with_xshape, phi::FlattenWithXShape, float, phi::dtype::float16, + phi::dtype::bfloat16, double, uint8_t, int8_t, From 42dd0f1b7a46d13a59e4d901dcd8499e7643bbce Mon Sep 17 00:00:00 2001 From: qipengh Date: Tue, 7 Jun 2022 14:29:49 +0800 Subject: [PATCH 21/53] [MLU]support cast double type (#43058) * [MLU]support cast double type * [MLU]fix cast test --- paddle/fluid/operators/cast_op_mlu.cc | 32 +------------------ paddle/fluid/operators/mlu/mlu_baseop.h | 3 ++ .../tests/unittests/mlu/test_cast_op_mlu.py | 19 +++++++++++ 3 files changed, 23 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/cast_op_mlu.cc b/paddle/fluid/operators/cast_op_mlu.cc index f28889e7acf87..f0df271a8d07e 100644 --- a/paddle/fluid/operators/cast_op_mlu.cc +++ b/paddle/fluid/operators/cast_op_mlu.cc @@ -44,37 +44,7 @@ class CastMLUKernel : public framework::OpKernel { framework::DataTypeToString(src_type), framework::DataTypeToString(dst_type))); - switch (dst_type) { - case VT::FP32: - output->mutable_data(place); - break; - case VT::FP16: - output->mutable_data(place); - break; - case VT::INT32: - output->mutable_data(place); - break; - case VT::INT16: - output->mutable_data(place); - break; - case VT::INT8: - output->mutable_data(place); - break; - case VT::UINT8: - output->mutable_data(place); - break; - case VT::BOOL: - output->mutable_data(place); - break; - case VT::INT64: - output->mutable_data(place); - break; - default: - PADDLE_THROW(platform::errors::Unavailable( - "Not supported cast %d -> %d", - framework::DataTypeToString(src_type), - framework::DataTypeToString(dst_type))); - } + output->mutable_data(place, framework::TransToPhiDataType(dst_type)); MLUCnnlTensorDesc input_desc(*input); MLUCnnlTensorDesc output_desc(*output); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index f048ac7c5c3be..c97ee3efd3f56 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -75,6 +75,9 @@ inline cnnlDataType_t ToCnnlDataType( case DataType::FLOAT32: type = CNNL_DTYPE_FLOAT; break; + case DataType::FLOAT64: + type = CNNL_DTYPE_DOUBLE; + break; case DataType::INT8: type = CNNL_DTYPE_INT8; break; diff --git a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py index 6ba62b11499f4..88b46af8df2a3 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_cast_op_mlu.py @@ -61,6 +61,25 @@ def setUp(self): self.op_type = 'cast' self.place = paddle.device.MLUPlace(0) self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestCastOpFp32ToFp64(OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]) + self.inputs = {'X': ipt.astype('float32')} + self.outputs = {'Out': ipt.astype('float64')} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.FP64) + } + self.op_type = 'cast' + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True def test_check_output(self): self.check_output_with_place(self.place, atol=1e-3) From 2922985a3d0d2a77a4e3fe4b5650755166be5768 Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Tue, 7 Jun 2022 15:31:03 +0800 Subject: [PATCH 22/53] [Dygraph] Fix bugs of 
EagerReducer for complex control flows (#43252) * fix bugs of reducer * update * update --- paddle/fluid/distributed/collective/reducer.cc | 7 +++++++ .../distributed/fleet/utils/hybrid_parallel_util.py | 11 ++++++----- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/test_parallel_dygraph_dataparallel.py | 4 +++- .../test_parallel_dygraph_no_sync_gradient_check.py | 4 ++++ 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 9c04b95a732e8..f3ac17cc46cd2 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -775,6 +775,13 @@ void EagerReducer::ProcessUnusedDenseVars() { continue; } + // NOTE(haohongxiang): Calling SetFakeEmpty here is to make sure that + // gradient accumulation can continue normally after clear_gradients() + // especiall in cases including complex control flow. + std::static_pointer_cast( + GetGradNodeFromTensor(&tensors_[var_index])) + ->SetFakeEmpty(false); + Tensor grad_value(std::make_shared(src_tensor)); auto dest_var_base = tensors_[var_index]; diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index e2f7af769d39e..161f4d3262ab1 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -43,12 +43,11 @@ def _apply_collective_grads(parameters, comm_group): coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + nranks = paddle.distributed.get_world_size( + ) if comm_group is None else comm_group.nranks for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - nranks = paddle.distributed.get_world_size( - ) if comm_group is None else comm_group.nranks div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) - paddle.distributed.all_reduce(coalesced_grad, group=comm_group) paddle.fluid.framework._dygraph_tracer().trace_op( type="elementwise_div", inputs={ @@ -57,6 +56,7 @@ def _apply_collective_grads(parameters, comm_group): }, outputs={'Out': coalesced_grad}, attrs={'axis': -1}) + paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) @@ -76,10 +76,11 @@ def _apply_collective_grads_eager(parameters, comm_group): coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) - div_factor = 1.0 / comm_group.nranks + nranks = paddle.distributed.get_world_size( + ) if comm_group is None else comm_group.nranks for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad.scale_(div_factor) + coalesced_grad.scale_(1.0 / nranks) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 214c68c250ea9..6710ddb97dc24 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1507,7 +1507,7 @@ if(WITH_DISTRIBUTE 350) set_tests_properties(test_parallel_dygraph_no_sync PROPERTIES TIMEOUT 300) set_tests_properties(test_parallel_dygraph_no_sync_gradient_check - PROPERTIES TIMEOUT 30) + PROPERTIES TIMEOUT 60) set_tests_properties(test_parallel_dygraph_pipeline_parallel PROPERTIES TIMEOUT 500) set_tests_properties(test_parallel_dygraph_tensor_parallel 
PROPERTIES TIMEOUT diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 930bf5345fcae..1e8aae7226a7e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -200,7 +200,8 @@ def run_mnist_2cpu(self, target_file_name): class TestDataParallelGradientCheck(TestMultipleGpus): def test_multiple_gpus_dynamic(self): - self.run_mnist_2gpu('parallel_dygraph_gradient_check.py') + self.run_mnist_2gpu('parallel_dygraph_gradient_check.py', + eager_mode=False) class TestDataParallelWithPyLayer(TestMultipleGpus): @@ -218,4 +219,5 @@ def test_multiple_gpus_dynamic(self): if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py index fad9e902cc91e..d6a48b504a2dc 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_no_sync_gradient_check.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import paddle.fluid as fluid @@ -24,7 +25,10 @@ class TestDataParallelLayer(TestMultipleGpus): def test_parallel_dygraph_dataparallel_no_sync(self): self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py') + self.run_mnist_2gpu('parallel_dygraph_no_sync_gradient_check.py', + eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() From b4a3dab727e5d5c50b040326ab9e52ba82b957f7 Mon Sep 17 00:00:00 2001 From: Yuang Liu Date: Tue, 7 Jun 2022 16:20:37 +0800 Subject: [PATCH 23/53] [cuda graph] Add cuda graph attr to op desc (#43228) --- python/paddle/device/cuda/graphs.py | 20 +++ python/paddle/fluid/backward.py | 137 ++++++++++++++---- python/paddle/fluid/framework.py | 35 +++++ .../test_cuda_graph_partial_graph_static.py | 71 +++++++++ 4 files changed, 237 insertions(+), 26 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py index c6554d78fb86a..dca32fb6bb85b 100644 --- a/python/paddle/device/cuda/graphs.py +++ b/python/paddle/device/cuda/graphs.py @@ -13,6 +13,7 @@ # limitations under the License. 
import os +import paddle from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace if is_compiled_with_cuda() and not is_compiled_with_rocm(): @@ -28,6 +29,7 @@ def is_cuda_graph_supported(): ALL_MODES = ["global", "thread_local", "relaxed"] +cuda_graph_id = 0 class CUDAGraph: @@ -68,6 +70,24 @@ def print_to_dot_files(self, dirname, flags=None): def wrap_cuda_graph(function, mode="thread_local", memory_pool="default"): assert mode in ALL_MODES + if not paddle.in_dynamic_mode(): + # static mode + from paddle.fluid.framework import _cuda_graph_guard + global cuda_graph_id + graph_id = str(cuda_graph_id) + cuda_graph_id += 1 + if memory_pool == 'default': + memory_pool_id = 0 + elif memory_pool == 'new': + memory_pool_id = CoreCUDAGraph.gen_new_memory_pool_id() + else: + raise ValueError( + "memory_pool should be one of default or new under static mode, but got", + memory_pool) + return _cuda_graph_guard( + mode + ';' + str(memory_pool_id) + ';' + + graph_id)(lambda *args, **kwargs: function(*args, **kwargs)) + from paddle.jit import to_static from paddle.nn import Layer new_function = to_static(function) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 0ca69b5f94de4..c37ac87da71b8 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -236,7 +236,11 @@ def _pretty_op_desc_(op_desc, prefix): return out_s -def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): +def _add_needed_descs_to_block(descs, + block, + main_block, + in_memory_vars, + grad_op_id_to_fwd_op=None): if len(descs) == 0: return [] result_descs = [] @@ -244,8 +248,11 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): core.op_proto_and_checker_maker.kOpRoleAttrName() backward = core.op_proto_and_checker_maker.OpRole.Backward for desc in descs: + origin_desc = desc + origin_is_operator = False if isinstance(desc, framework.Operator): desc = desc.desc + origin_is_operator = True if isinstance(desc, tuple): desc = desc[0] is_needed = False @@ -255,6 +262,8 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): if name not in in_memory_vars: is_needed = True if is_needed: + if origin_is_operator and grad_op_id_to_fwd_op is not None: + grad_op_id_to_fwd_op[desc.original_id()] = origin_desc new_op_desc = block.desc.append_op() new_op_desc.copy_from(desc) new_op_desc._set_attr(op_role_attr_name, backward) @@ -264,7 +273,7 @@ def _add_needed_descs_to_block(descs, block, main_block, in_memory_vars): return result_descs -def _add_descs_to_block(descs, block): +def _add_descs_to_block(descs, block, grad_op_id_to_fwd_op=None): if len(descs) == 0: return [] result_descs = [] @@ -273,6 +282,9 @@ def _add_descs_to_block(descs, block): backward = core.op_proto_and_checker_maker.OpRole.Backward for desc in descs: if isinstance(desc, framework.Operator): + # for recompute, should record recompute ops + if grad_op_id_to_fwd_op is not None: + grad_op_id_to_fwd_op[desc.desc.original_id()] = desc desc = desc.desc if isinstance(desc, tuple): desc = desc[0] @@ -489,7 +501,10 @@ def _accumulate_gradients_by_add_ops_(var_name, renamed_vars[var_name] = [var_name] -def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): +def _addup_repetitive_outputs_(op_descs, + block_idx, + grad_var_to_var=None, + grad_op_id_to_fwd_op=None): """ In backward part, an variable may be the output of more than one ops. And one op may yield its multiple outputs to the same variable. 
@@ -500,6 +515,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): grad_var_to_var(dict): used to build the mapping between grad var name and forward var name. Only for auto parallel. """ + _MAX_ADD_NUM_ = framework._global_flags()['FLAGS_max_inplace_grad_add'] #pending_sum_ops = [] pending_sum_ops = collections.OrderedDict() @@ -604,6 +620,7 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): len(op_descs), var_device[var_name]) + op_descs_len = len(op_descs) # sum_op descs are sorted according to their insert position for key, value in collections.OrderedDict( reversed(list(pending_sum_ops.items()))).items(): @@ -614,12 +631,18 @@ def _addup_repetitive_outputs_(op_descs, block_idx, grad_var_to_var=None): # If not reverse, we first insert 'a' at idx 1, it becomes [0, 1, 'a', 2], and then insert 'b' at idx 2, it becomes [0, 1, 'a', 'b', 2]. idx = key for i, op in enumerate(value): + # update the mapping between fwd and bwd + target_idx = idx - 1 if idx == op_descs_len else idx + i + if grad_op_id_to_fwd_op is not None and grad_op_id_to_fwd_op.get( + op_descs[target_idx].original_id(), None) is not None: + grad_op_id_to_fwd_op[op.original_id()] = grad_op_id_to_fwd_op[ + op_descs[target_idx].original_id()] op_descs.insert(idx + i, op) return op_descs -def _remove_no_grad_branch_(op_descs, no_grad_set): +def _remove_no_grad_branch_(op_descs, no_grad_set, grad_op_id_to_fwd_op=None): """ Remove unnecessary grad ops A grad op can be removed in two cases: @@ -653,9 +676,14 @@ def _op_can_be_removed_(op_desc, no_grad_set): x_in = _strip_grad_suffix_(arg) # the reason should be: arg can be input of another grad op # and the op is a not-to-remove op - to_insert.append( - (_create_op_desc_("fill_zeros_like", {"X": [x_in]}, - {"Out": [arg]}, {}), idx)) + new_op_desc = _create_op_desc_("fill_zeros_like", {"X": [x_in]}, + {"Out": [arg]}, {}) + # update the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None and grad_op_id_to_fwd_op.get( + op_desc.original_id(), None) is not None: + grad_op_id_to_fwd_op[new_op_desc.original_id( + )] = grad_op_id_to_fwd_op[op_desc.original_id()] + to_insert.append((new_op_desc, idx)) list([op_descs.insert(p[1], p[0]) for p in reversed(to_insert)]) @@ -794,9 +822,13 @@ def serialize_op_decs(op_desc): return proto.__str__() -def _append_backward_ops_with_checkpoints_(block, ops, target_block, - no_grad_dict, grad_to_var, - checkpoints): +def _append_backward_ops_with_checkpoints_(block, + ops, + target_block, + no_grad_dict, + grad_to_var, + checkpoints, + grad_op_id_to_fwd_op=None): """ Create grad ops with forward ops, and insert them into given block @@ -926,12 +958,19 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Set device for grad_op according to forward Op if op.desc.has_attr(device_attr_name): op_device = op.desc.attr(device_attr_name) for op_desc in grad_op_desc: op_desc._set_attr(device_attr_name, op_device) - added_descs = _add_descs_to_block(grad_op_desc, local_block) + added_descs = _add_descs_to_block(grad_op_desc, local_block, + grad_op_id_to_fwd_op) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -945,12 +984,19 @@ def 
_append_backward_ops_with_checkpoints_(block, ops, target_block, _pretty_op_desc_(op.desc, "with_sub_block")) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Set device for grad_op according to forward Op if op.desc.has_attr(device_attr_name): op_device = op.desc.attr(device_attr_name) for op_desc in grad_op_desc: op_desc._set_attr(device_attr_name, op_device) - added_descs = _add_descs_to_block(grad_op_desc, local_block) + added_descs = _add_descs_to_block(grad_op_desc, local_block, + grad_op_id_to_fwd_op) grad_op_descs.extend(added_descs) grad_to_var.update(op_grad_to_var) @@ -984,8 +1030,10 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, # 3.a. add ops in current recompute_segment as forward recomputation ops buffer_descs = _add_needed_descs_to_block(ff_ops, buffer_block, block, - vars_in_memory) - added_descs = _add_descs_to_block(ff_ops, local_block) + vars_in_memory, + grad_op_id_to_fwd_op) + added_descs = _add_descs_to_block(ff_ops, local_block, + grad_op_id_to_fwd_op) # 3.b. rename all non-checkpoint variables in recomputation ops for key in var_name_dict: @@ -999,6 +1047,12 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op_desc, cpt.to_text(no_grad_dict[block.idx]), []) + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for g_op_desc in grad_op_desc: + grad_op_id_to_fwd_op[g_op_desc.original_id( + )] = grad_op_id_to_fwd_op[op_desc.original_id()] + # Set device for grad_op according to forward Op if op_desc.has_attr(device_attr_name): op_device = op_desc.attr(device_attr_name) @@ -1011,11 +1065,14 @@ def _append_backward_ops_with_checkpoints_(block, ops, target_block, grad_to_var.update(op_grad_to_var) # 3.d. 
add sum op for repetitive_outputs - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx) + grad_op_descs = _addup_repetitive_outputs_( + grad_op_descs, block.idx, grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) # 4) remove no grad branch as it is in _remove_no_grad_branch_ grad_op_descs = _remove_no_grad_branch_(grad_op_descs, - no_grad_dict[block.idx]) - added_descs = _add_descs_to_block(grad_op_descs, target_block) + no_grad_dict[block.idx], + grad_op_id_to_fwd_op) + added_descs = _add_descs_to_block(grad_op_descs, target_block, + grad_op_id_to_fwd_op) return program_stat, checkpoints_name, vars_should_be_hold, recompute_segments @@ -1090,7 +1147,8 @@ def _append_backward_ops_(block, input_grad_names_set=None, op_path_dict=None, distop_context=None, - rename_var_map=None): + rename_var_map=None, + grad_op_id_to_fwd_op=None): """ Create all grad ops, and insert them into given block @@ -1152,9 +1210,15 @@ def update_distop_context(distop_context, op_grad_to_var, pre_input_grad_names_set = copy.copy(input_grad_names_set) input_grad_names_set = None sub_block_path = op_path_dict[op._block_attr_id("sub_block")] - _append_backward_ops_(sub_block, sub_block_path, grad_sub_block, - no_grad_dict, grad_to_var, callbacks, - input_grad_names_set, op_path_dict) + _append_backward_ops_(sub_block, + sub_block_path, + grad_sub_block, + no_grad_dict, + grad_to_var, + callbacks, + input_grad_names_set, + op_path_dict, + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) input_grad_names_set = pre_input_grad_names_set program._rollback() @@ -1164,6 +1228,11 @@ def update_distop_context(distop_context, op_grad_to_var, grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list) + # record the mapping between fwd and bwd + if grad_op_id_to_fwd_op is not None: + for op_desc in grad_op_desc: + grad_op_id_to_fwd_op[op_desc.original_id()] = op + # Build the mapping between the forward op and backward op (Only for auto parallel) if distop_context is not None: update_distop_context(distop_context, op_grad_to_var, @@ -1251,13 +1320,17 @@ def update_distop_context(distop_context, op_grad_to_var, grad_var_to_var = distop_context.grad_var_to_var[ program._appending_grad_times] # sum parameter's gradients' var given multiple var gradient - grad_op_descs = _addup_repetitive_outputs_(grad_op_descs, block.idx, - grad_var_to_var) + grad_op_descs = _addup_repetitive_outputs_( + grad_op_descs, + block.idx, + grad_var_to_var, + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) # if all outputs of the grad op are in no_grad_set, then just remove and fill zero # if all inputs of the grad op are in no_grad_set, just remove this op grad_op_descs = _remove_no_grad_branch_(grad_op_descs, - no_grad_dict[block.idx]) + no_grad_dict[block.idx], + grad_op_id_to_fwd_op) # remove some backward ops not_need_ops = _find_not_need_ops(grad_op_descs, ops, input_grad_names_set) @@ -1585,6 +1658,9 @@ def append_backward(loss, p_g_list6 = paddle.static.append_backward(loss=avg_loss, parameter_list=all_weights, no_grad_set=set(all_weights)) """ + grad_op_id_to_fwd_op = { + } # for cuda graph usage, recording the mapping between grad op original id to fwd op + check_type(loss, 'loss', framework.Variable, 'paddle.static.append_backward') @@ -1644,7 +1720,9 @@ def append_backward(loss, grad_to_var = dict() + # pass the cuda_graph_attr to the fill_constant which generates the loss_grad op_desc = _create_loss_op_desc_(loss) + grad_op_id_to_fwd_op[op_desc.original_id()] = loss.op 
target_grad_block.desc.append_op().copy_from(op_desc) for block_idx in son_parent_block_idx_dict: @@ -1690,7 +1768,8 @@ def append_backward(loss, root_block, no_grad_dict, grad_to_var, - checkpoints) + checkpoints, + grad_op_id_to_fwd_op) else: _append_backward_ops_( block, # the block where forward ops are in @@ -1702,7 +1781,7 @@ def append_backward(loss, input_grad_names_set=input_grad_names_set, op_path_dict=op_path_dict, distop_context=distop_context, - ) + grad_op_id_to_fwd_op=grad_op_id_to_fwd_op) grad_info_map = dict() @@ -1722,6 +1801,12 @@ def append_backward(loss, program.current_block_idx = current_block_idx program._sync_with_cpp() + # for cuda graph, copy the cuda graph attr from forward op to backward op + for op in target_grad_block.ops: + if grad_op_id_to_fwd_op.get(op.desc.original_id(), None) is not None: + fwd_op = grad_op_id_to_fwd_op[op.desc.original_id()] + op._cuda_graph_attr = fwd_op._cuda_graph_attr + if parameter_list is not None: check_type(parameter_list, 'parameter_list', (list, tuple, set), 'fluid.backward.append_backward') diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index e0b4f8d19e861..fdd5c0b47b4dc 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -81,6 +81,7 @@ _current_pipeline_stage = None _already_patch_eager_tensor = False _already_patch_varbase = False +_current_cuda_graph_mode = None _global_flags_ = core.globals() # Some explanation of our execution system 2022.03 @@ -2622,6 +2623,9 @@ def __init__(self, op_attrs = dict() del attrs + # attr for static mode cuda graph + self._cuda_graph_attr = _current_cuda_graph_mode + op_maker = core.op_proto_and_checker_maker if op_maker.kOpRoleAttrName() not in op_attrs: @@ -7017,6 +7021,37 @@ def device_guard(device=None): switch_device(pre_device) +def _switch_cuda_graph_mode(cuda_graph_attr): + global _current_cuda_graph_mode + pre_mode = _current_cuda_graph_mode + _current_cuda_graph_mode = cuda_graph_attr + return pre_mode + + +@signature_safe_contextmanager +def _cuda_graph_guard(cuda_graph_attr=None): + """ + + Note: + The API only supports static mode. + + A context manager that specifies the cuda_graph_mode which indicating the cuda graph capture under static mode. + + Args: + cuda_graph_attr(str|None): The cuda graph attr with the format of: + cuda_graph_capture_mode;memory_pool_id;cuda_graph_id + """ + assert not _non_static_mode( + ), "cuda_graph_guard only works under static mode" + assert core.is_compiled_with_cuda( + ), "cuda_graph_guard context can be only used when Paddle is compiled with cuda" + pre_mode = _switch_cuda_graph_mode(cuda_graph_attr) + try: + yield + finally: + _switch_cuda_graph_mode(pre_mode) + + def set_flags(flags): """ This function sets the GFlags value in Paddle. diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py new file mode 100644 index 0000000000000..b70be74ea92a5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph_partial_graph_static.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.nn as nn +import unittest +import numpy as np +from paddle.device.cuda.graphs import wrap_cuda_graph, is_cuda_graph_supported + +paddle.enable_static() + + +class SimpleModel(nn.Layer): + + def __init__(self, in_size, out_size): + super(SimpleModel, self).__init__() + self.linear = nn.Linear(in_size, out_size) + self.dropout_1 = paddle.nn.Dropout(0.1) + self.relu = nn.ReLU() + self.dropout_2 = paddle.nn.Dropout(0.5) + self.gelu = nn.GELU() + + def forward(self, x): + x = self.linear(x) + x = self.dropout_1(x) + x = self.relu(x) + x = self.dropout_2(x) + x = self.gelu(x) + return x + + +class TestCudaGraphAttrAll(unittest.TestCase): + + def test_all_program(self): + if not is_cuda_graph_supported(): + return + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + model = SimpleModel(10, 20) + cuda_graph_model = wrap_cuda_graph(model) + x = paddle.static.data(shape=[3, 10], dtype='float32', name='x') + y = cuda_graph_model(x) + loss = paddle.mean(y) + opt = paddle.optimizer.SGD() + opt.minimize(loss) + block = main_prog.global_block() + for op in block.ops: + if op._cuda_graph_attr is None: + # the loss and opt are not wrapped + assert op.type in [ + 'sgd', 'reduce_mean', 'fill_constant', + 'reduce_mean_grad' + ] + else: + assert op._cuda_graph_attr == 'thread_local;0;0' + + +if __name__ == "__main__": + unittest.main() From c0ed75a8babb86a1fec345601f9aa39cd1756ee5 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Tue, 7 Jun 2022 16:22:47 +0800 Subject: [PATCH 24/53] Update profiler (#42998) * Update Profiler * make HostEventRecorder templated --- paddle/fluid/platform/profiler.cc | 30 +++++++++++-------- .../platform/profiler/host_event_recorder.h | 23 +++++++------- paddle/fluid/platform/profiler/host_tracer.cc | 8 ++--- 3 files changed, 34 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index c573650f1791f..47141bd73a555 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -192,15 +192,15 @@ void RecordEvent::End() { if (LIKELY(FLAGS_enable_host_event_recorder_hook && is_enabled_)) { uint64_t end_ns = PosixInNsec(); if (LIKELY(shallow_copy_name_ != nullptr)) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( shallow_copy_name_, start_ns_, end_ns, role_, type_); } else if (name_ != nullptr) { if (attr_ == nullptr) { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_); } else { - HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, - role_, type_, *attr_); + HostEventRecorder::GetInstance().RecordEvent( + *name_, start_ns_, end_ns, role_, type_, *attr_); delete attr_; } delete name_; @@ -232,8 +232,8 @@ RecordInstantEvent::RecordInstantEvent(const char *name, TracerEventType type, return; } auto start_end_ns = PosixInNsec(); - 
HostEventRecorder::GetInstance().RecordEvent(name, start_end_ns, start_end_ns, - EventRole::kOrdinary, type); + HostEventRecorder::GetInstance().RecordEvent( + name, start_end_ns, start_end_ns, EventRole::kOrdinary, type); } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -327,7 +327,7 @@ void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void Mark(const std::string &name) { if (FLAGS_enable_host_event_recorder_hook) { - HostEventRecorder::GetInstance().RecordEvent( + HostEventRecorder::GetInstance().RecordEvent( name, 0, 0, EventRole::kOrdinary, TracerEventType::UserDefined); return; } @@ -522,7 +522,8 @@ void DisableHostEventRecorder() { std::string PrintHostEvents() { std::ostringstream oss; - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); for (const auto &thr_evt_sec : host_evt_sec.thr_sections) { oss << thr_evt_sec.thread_id << std::endl; for (const auto &evt : thr_evt_sec.events) { @@ -534,8 +535,9 @@ std::string PrintHostEvents() { return oss.str(); } -static void EmulateEventPushAndPop(const HostEventSection &host_sec, - std::map *out) { +static void EmulateEventPushAndPop( + const HostEventSection &host_sec, + std::map *out) { for (const auto &thr_sec : host_sec.thr_sections) { uint64_t tid = thr_sec.thread_id; auto cur_thr_list = std::make_shared>(); @@ -582,7 +584,8 @@ static void EmulateEventPushAndPop(const HostEventSection &host_sec, } } -static void EmulateCPURecordsAdd(const HostEventSection &host_sec) { +static void EmulateCPURecordsAdd( + const HostEventSection &host_sec) { DeviceTracer *tracer = GetDeviceTracer(); if (tracer == nullptr) { return; @@ -610,7 +613,8 @@ static std::map DockHostEventRecorderHostPart() { if (FLAGS_enable_host_event_recorder_hook == false) { return thr_events; } - auto host_evt_sec = HostEventRecorder::GetInstance().GatherEvents(); + auto host_evt_sec = + HostEventRecorder::GetInstance().GatherEvents(); EmulateEventPushAndPop(host_evt_sec, &thr_events); EmulateCPURecordsAdd(host_evt_sec); return thr_events; diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h index 1359c3b85a096..d5b495e8b25b6 100644 --- a/paddle/fluid/platform/profiler/host_event_recorder.h +++ b/paddle/fluid/platform/profiler/host_event_recorder.h @@ -21,7 +21,6 @@ #include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/os_info.h" -#include "paddle/fluid/platform/profiler/common_event.h" namespace paddle { namespace platform { @@ -182,12 +181,14 @@ char *EventContainer::GetStringStorage(size_t sz) { return storage; } +template struct ThreadEventSection { std::string thread_name; uint64_t thread_id; - std::vector events; + std::vector events; }; +template class ThreadEventRecorder { public: ThreadEventRecorder() { @@ -204,8 +205,8 @@ class ThreadEventRecorder { base_evt_cntr_.Record(std::forward(args)...); } - ThreadEventSection GatherEvents() { - ThreadEventSection thr_sec; + ThreadEventSection GatherEvents() { + ThreadEventSection thr_sec; thr_sec.thread_name = thread_name_; thr_sec.thread_id = thread_id_; thr_sec.events = std::move(base_evt_cntr_.Reduce()); @@ -215,15 +216,17 @@ class ThreadEventRecorder { private: uint64_t thread_id_; std::string thread_name_; - EventContainer base_evt_cntr_; + EventContainer base_evt_cntr_; }; +template struct HostEventSection { 
std::string process_name; uint64_t process_id; - std::vector thr_sections; + std::vector> thr_sections; }; +template class HostEventRecorder { public: // singleton @@ -244,10 +247,10 @@ class HostEventRecorder { // thread-unsafe, make sure make sure there is no running tracing. // Poor performance, call it at the ending - HostEventSection GatherEvents() { + HostEventSection GatherEvents() { auto thr_recorders = ThreadEventRecorderRegistry::GetInstance().GetAllThreadDataByRef(); - HostEventSection host_sec; + HostEventSection host_sec; host_sec.process_id = GetProcessId(); host_sec.thr_sections.reserve(thr_recorders.size()); for (auto &kv : thr_recorders) { @@ -260,12 +263,12 @@ class HostEventRecorder { private: using ThreadEventRecorderRegistry = - framework::ThreadDataRegistry; + framework::ThreadDataRegistry>; HostEventRecorder() = default; DISABLE_COPY_AND_ASSIGN(HostEventRecorder); - ThreadEventRecorder *GetThreadLocalRecorder() { + ThreadEventRecorder *GetThreadLocalRecorder() { return ThreadEventRecorderRegistry::GetInstance() .GetMutableCurrentThreadData(); } diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 8a36a3a8bab44..bde1395c1253c 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -30,7 +30,7 @@ namespace platform { namespace { -void ProcessHostEvents(const HostEventSection& host_events, +void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; @@ -62,7 +62,7 @@ void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( state_ == TracerState::READY || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("TracerState must be READY")); - HostEventRecorder::GetInstance().GatherEvents(); + HostEventRecorder::GetInstance().GatherEvents(); HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } @@ -79,8 +79,8 @@ void HostTracer::CollectTraceData(TraceEventCollector* collector) { PADDLE_ENFORCE_EQ( state_, TracerState::STOPED, platform::errors::PreconditionNotMet("TracerState must be STOPED")); - HostEventSection host_events = - HostEventRecorder::GetInstance().GatherEvents(); + HostEventSection host_events = + HostEventRecorder::GetInstance().GatherEvents(); ProcessHostEvents(host_events, collector); } From f3d43fa9903a6c89afc0f238d0c08fb38fa58ef5 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 7 Jun 2022 16:50:43 +0800 Subject: [PATCH 25/53] patch pr (#43270) --- .../fluid/inference/api/resource_manager.cc | 165 +++++++++++++----- paddle/fluid/inference/api/resource_manager.h | 66 ++++++- 2 files changed, 174 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc index d88f282ce7a62..4cd84995a2e2f 100644 --- a/paddle/fluid/inference/api/resource_manager.cc +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/inference/api/resource_manager.h" +#include +#include #include #include "paddle/fluid/memory/allocation/allocator_facade.h" @@ -106,31 +108,26 @@ class EigenGpuStreamDevice : public Eigen::StreamInterface { #endif } // namespace internal -ResourceManager::ResourceManager(const phi::Place& place, void* stream) - : place_(place) { - InitCPUResource(); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - InitGPUResource(stream); -#endif -} - 
-ResourceManager::~ResourceManager() { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - DestroyGPUResource(); -#endif +Eigen::DefaultDevice* CPUContextResource::GetCPUEigenDevice() const { + return cpu_eigen_device_.get(); } -void ResourceManager::InitCPUResource() { +void CPUContextResource::InitCPUResource() { cpu_eigen_device_.reset(new Eigen::DefaultDevice()); } -Eigen::DefaultDevice* ResourceManager::GetCpuEigenDevice() { - return cpu_eigen_device_.get(); -} +CPUContextResource::CPUContextResource() { InitCPUResource(); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -void ResourceManager::InitGPUResource(void* stream) { +GPUContextResource::GPUContextResource(const phi::Place& place, void* stream) + : place_(place) { + InitGPUResource(stream); +} + +GPUContextResource::~GPUContextResource() { DestroyGPUResource(); } + +void GPUContextResource::InitGPUResource(void* stream) { + phi::backends::gpu::GPUDeviceGuard guard(place_.device); if (stream == nullptr) { owned_stream_ = true; phi::InitStream(&stream_); @@ -148,7 +145,7 @@ void ResourceManager::InitGPUResource(void* stream) { InitSparseHandle(); } -void ResourceManager::DestroyGPUResource() { +void GPUContextResource::DestroyGPUResource() { if (owned_stream_) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); @@ -165,15 +162,14 @@ void ResourceManager::DestroyGPUResource() { DestroySparseHandle(); } -void ResourceManager::InitGpuProperties() { - phi::backends::gpu::GPUDeviceGuard guard(place_.device); +void GPUContextResource::InitGpuProperties() { phi::InitGpuProperties(place_, &compute_capability_, &runtime_version_, &driver_version_, &multi_process_, &max_threads_per_mp_, &max_threads_per_block_, &max_grid_dim_size_); } -void ResourceManager::InitGpuEigenDevice() { +void GPUContextResource::InitGpuEigenDevice() { auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(place_) .get(); @@ -182,13 +178,15 @@ void ResourceManager::InitGpuEigenDevice() { gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); } -void ResourceManager::InitDnnHanlde() { +void GPUContextResource::InitDnnHanlde() { phi::InitDnnHandle(&dnn_handle_, stream_, place_); } -void ResourceManager::DestroyDnnHandle() { phi::DestroyDnnHandle(dnn_handle_); } +void GPUContextResource::DestroyDnnHandle() { + phi::DestroyDnnHandle(dnn_handle_); +} -void ResourceManager::InitBlasHandle() { +void GPUContextResource::InitBlasHandle() { phi::InitBlasHandle(&blas_handle_, stream_); #ifdef PADDLE_WITH_CUDA #if CUDA_VERSION >= 9000 @@ -204,87 +202,158 @@ void ResourceManager::InitBlasHandle() { #endif } -void ResourceManager::DestroyBlasHandle() { +void GPUContextResource::DestroyBlasHandle() { phi::DestroyBlasHandle(blas_handle_); phi::DestroyBlasHandle(blas_tensor_core_handle_); phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); } -void ResourceManager::InitBlasLtHandle() { +void GPUContextResource::InitBlasLtHandle() { phi::InitBlasLtHandle(&blaslt_handle_); } -void ResourceManager::DestroyBlasLtHandle() { +void GPUContextResource::DestroyBlasLtHandle() { phi::DestroyBlasLtHandle(blaslt_handle_); } -void ResourceManager::InitSolverHandle() { +void GPUContextResource::InitSolverHandle() { phi::InitSolverHandle(&solver_handle_, stream_); } -void ResourceManager::DestroySolverHandle() { +void GPUContextResource::DestroySolverHandle() { phi::DestroySolverHandle(solver_handle_); } -void ResourceManager::InitSparseHandle() { +void GPUContextResource::InitSparseHandle() { 
phi::InitSparseHandle(&sparse_handle_, stream_); } -void ResourceManager::DestroySparseHandle() { +void GPUContextResource::DestroySparseHandle() { phi::DestroySparseHandle(sparse_handle_); } -gpuStream_t ResourceManager::GetStream() const { return stream_; } +gpuStream_t GPUContextResource::GetStream() const { return stream_; } -dnnHandle_t ResourceManager::GetDnnHandle() const { return dnn_handle_; } +dnnHandle_t GPUContextResource::GetDnnHandle() const { return dnn_handle_; } -blasHandle_t ResourceManager::GetBlasHandle() const { return blas_handle_; } +blasHandle_t GPUContextResource::GetBlasHandle() const { return blas_handle_; } -blasHandle_t ResourceManager::GetBlasTensorCoreHandle() const { +blasHandle_t GPUContextResource::GetBlasTensorCoreHandle() const { return blas_tensor_core_handle_; } -blasHandle_t ResourceManager::GetBlasTF32Handle() const { +blasHandle_t GPUContextResource::GetBlasTF32Handle() const { return blas_tf32_tensor_core_handle_; } -blasLtHandle_t ResourceManager::GetBlasLtHandle() const { +blasLtHandle_t GPUContextResource::GetBlasLtHandle() const { return blaslt_handle_; } -phi::solverHandle_t ResourceManager::GetSolverDnHandle() const { +phi::solverHandle_t GPUContextResource::GetSolverDnHandle() const { return solver_handle_; } -phi::sparseHandle_t ResourceManager::GetSparseHandle() const { +phi::sparseHandle_t GPUContextResource::GetSparseHandle() const { return sparse_handle_; } -Eigen::GpuDevice* ResourceManager::GetGpuEigenDevice() const { +Eigen::GpuDevice* GPUContextResource::GetGpuEigenDevice() const { return gpu_eigen_device_.get(); } -int ResourceManager::GetGpuComputeCapability() const { +int GPUContextResource::GetGpuComputeCapability() const { return compute_capability_; } -int ResourceManager::GetGpuRuntimeVersion() const { return runtime_version_; } +int GPUContextResource::GetGpuRuntimeVersion() const { + return runtime_version_; +} -int ResourceManager::GetGpuDriverVersion() const { return driver_version_; } +int GPUContextResource::GetGpuDriverVersion() const { return driver_version_; } -int ResourceManager::GetGPUMultiProcessors() const { return multi_process_; } +int GPUContextResource::GetGPUMultiProcessors() const { return multi_process_; } -int ResourceManager::GetGpuMaxThreadsPerMp() const { +int GPUContextResource::GetGpuMaxThreadsPerMp() const { return max_threads_per_mp_; } -int ResourceManager::GetGpuMaxThreadsPerBlock() const { +int GPUContextResource::GetGpuMaxThreadsPerBlock() const { return max_threads_per_block_; } -std::array ResourceManager::GetGpuMaxGridDimSize() const { +std::array GPUContextResource::GetGpuMaxGridDimSize() const { return max_grid_dim_size_; } #endif + +void ResourceManager::InitCPUResource() { + std::lock_guard lock_gurad(cpu_mutex_); + if (cpu_resource_ == nullptr) { + cpu_resource_.reset(new CPUContextResource()); + } +} + +CPUContextResource* ResourceManager::GetCPUResource() const { + PADDLE_ENFORCE_NOT_NULL( + cpu_resource_.get(), + platform::errors::PreconditionNotMet("cpu_resource should be not null!")); + return cpu_resource_.get(); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void* ResourceManager::InitGPUResource(const phi::Place& place, void* stream) { + std::lock_guard lock_gurad(gpu_mutex_); + if (gpu_resources_.count(stream)) { + Increase(stream); + return stream; + } else { + std::unique_ptr resource{ + new GPUContextResource(place, stream)}; + gpuStream_t s = resource->GetStream(); + ref_count_[s] = 1; + gpu_resources_.emplace(s, std::move(resource)); + return s; + } +} + 
+void ResourceManager::DestroyGPUResource(void* stream) { + PADDLE_ENFORCE_EQ(gpu_resources_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in gpu_resources.", stream)); + Decrease(stream); +} + +void ResourceManager::Decrease(void* stream) { + PADDLE_ENFORCE_EQ(ref_count_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in ref_count.", stream)); + --ref_count_[stream]; + if (ref_count_[stream] == 0) { + ref_count_.erase(stream); + gpu_resources_.erase(stream); + } +} + +void ResourceManager::Increase(void* stream) { + PADDLE_ENFORCE_EQ(ref_count_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in ref_count.", stream)); + ++ref_count_[stream]; +} + +GPUContextResource* ResourceManager::GetGPUResource(void* stream) const { + PADDLE_ENFORCE_EQ(gpu_resources_.count(stream), true, + platform::errors::InvalidArgument( + "The stream[%p] not found in gpu_resources.", stream)); + return gpu_resources_.at(stream).get(); +} + +int ResourceManager::RefCount(void* stream) const { + if (ref_count_.count(stream) == 0) return 0; + return ref_count_.at(stream); +} +#endif + } // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 24e76598e400b..03345403159d5 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -13,9 +13,13 @@ // limitations under the License. #pragma once +#include #include +#include #include +#include +#include "paddle/fluid/platform/macros.h" #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" @@ -31,24 +35,24 @@ namespace internal { class EigenGpuStreamDevice; } // namespace internal -class ResourceManager { - public: - explicit ResourceManager(const phi::Place& place, void* stream); - ~ResourceManager(); - +class CPUContextResource { public: - Eigen::DefaultDevice* GetCpuEigenDevice(); + CPUContextResource(); + Eigen::DefaultDevice* GetCPUEigenDevice() const; private: void InitCPUResource(); private: - phi::Place place_; std::unique_ptr cpu_eigen_device_; +}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - +class GPUContextResource { public: + explicit GPUContextResource(const phi::Place& place, void* stream); + ~GPUContextResource(); + gpuStream_t GetStream() const; dnnHandle_t GetDnnHandle() const; blasHandle_t GetBlasHandle() const; @@ -83,6 +87,8 @@ class ResourceManager { void DestroySparseHandle(); private: + phi::Place place_; + int compute_capability_; int runtime_version_; int driver_version_; @@ -103,8 +109,50 @@ class ResourceManager { dnnHandle_t dnn_handle_{nullptr}; phi::solverHandle_t solver_handle_{nullptr}; phi::sparseHandle_t sparse_handle_{nullptr}; -// DnnWorkspaceHandle + // DnnWorkspaceHandle +}; #endif + +class ResourceManager { + public: + ResourceManager() = default; + static ResourceManager& Instance() { + static ResourceManager* resource_manager = new ResourceManager; + return *resource_manager; + } + + // CPU Resource + public: + void InitCPUResource(); + CPUContextResource* GetCPUResource() const; + + private: + std::mutex cpu_mutex_; + std::unique_ptr cpu_resource_{nullptr}; + +// GPU Resource +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + public: + void* InitGPUResource(const phi::Place& place, void* stream); + void DestroyGPUResource(void* stream); + GPUContextResource* GetGPUResource(void* stream) const; + int RefCount(void* stream) const; 
+ + private: + void Decrease(void* stream); + void Increase(void* stream); + + private: + std::mutex gpu_mutex_; + // a stream corresponding to a series of resource. + std::map> ref_count_; + std::map> + gpu_resources_; +#endif + + private: + DISABLE_COPY_AND_ASSIGN(ResourceManager); }; } // namespace paddle From 5dcebb9b3b4dc25e29cc1a98f802e80e7547a1cd Mon Sep 17 00:00:00 2001 From: Leo Chen <39020268+leo0519@users.noreply.github.com> Date: Tue, 7 Jun 2022 18:37:22 +0800 Subject: [PATCH 26/53] Allocate and use new memory for temp data in cumsum kernel (#43101) --- paddle/phi/kernels/gpu/cumsum_kernel.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/gpu/cumsum_kernel.cu b/paddle/phi/kernels/gpu/cumsum_kernel.cu index ed131e0ff5413..460aa37f8f995 100644 --- a/paddle/phi/kernels/gpu/cumsum_kernel.cu +++ b/paddle/phi/kernels/gpu/cumsum_kernel.cu @@ -263,8 +263,9 @@ void CumsumKernel(const Context& dev_ctx, dim3 blocks(32, 8); dim3 transpose_grids((width + tile_size - 1) / tile_size, (height + tile_size - 1) / tile_size); - out->Resize(out_dims); - auto* tmp_data = out->data(); + DenseTensor tmp_tensor; + tmp_tensor.Resize(out_dims); + auto* tmp_data = dev_ctx.template Alloc(&tmp_tensor); T* next_in_data = out_data; T* next_out_data = tmp_data; From 601d7a353da347418116dc4713cc221d68a77783 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Tue, 7 Jun 2022 19:00:06 +0800 Subject: [PATCH 27/53] Add use_master_acc_grad for DistributedFusedLamb (#43266) * add use_master_acc_grad * add ut --- .../optimizers/distributed_fused_lamb_op.cc | 3 +++ .../optimizers/distributed_fused_lamb_op.cu | 15 +++++++++++---- .../unittests/distributed_fused_lamb_test_base.py | 9 ++++++++- .../test_distributed_fused_lamb_op_with_clip.py | 4 +++- ...stributed_fused_lamb_op_with_gradient_merge.py | 6 ++++++ .../incubate/optimizer/distributed_fused_lamb.py | 3 +++ 6 files changed, 34 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 0159e250d317e..0f9bcc4c2d977 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -141,6 +141,9 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "NCCL communication data. 
If it is false, it would be less accurate " "and be less NCCL communication data.") .SetDefault(true); + AddAttr("use_master_acc_grad", + "Whether to use master gradient when acc_steps > 1.") + .SetDefault(true); AddAttr("is_grad_scaled_by_nranks", "Whether the input gradient has been scaled by nranks.") .SetDefault(true); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index eb354ef6d7576..e7f6223968f43 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -1193,7 +1193,9 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_acc_grad = nullptr; float *master_acc_grad = nullptr; + bool use_master_acc_grad = false; if (has_fp16_param) { + use_master_acc_grad = ctx.Attr("use_master_acc_grad"); auto *fp16_acc_grad_t = ctx.Output("FP16AccFusedGrad"); PADDLE_ENFORCE_NOT_NULL( @@ -1201,13 +1203,18 @@ class DistributedFusedLambOpKernel "Output(FP16AccFusedGrad) cannot be nullptr " "when Attr(acc_steps) > 1.")); if (!fp16_acc_grad_t->IsInitialized()) { - fp16_acc_grad_t->Resize({static_cast(3 * fp16_numel)}); + auto acc_grad_size = + use_master_acc_grad ? (3 * fp16_numel) : fp16_numel; + fp16_acc_grad_t->Resize({static_cast(acc_grad_size)}); fp16_acc_grad = fp16_acc_grad_t->mutable_data(place); } else { fp16_acc_grad = fp16_acc_grad_t->data(); } - master_acc_grad = reinterpret_cast(fp16_acc_grad + fp16_numel); + if (use_master_acc_grad) { + master_acc_grad = + reinterpret_cast(fp16_acc_grad + fp16_numel); + } } // Inplace addto @@ -1222,8 +1229,8 @@ class DistributedFusedLambOpKernel } if (has_fp16_param) { - if (acc_steps == 2) { - if (rounded_step == 0) { + if (acc_steps == 2 || !use_master_acc_grad) { + if (rounded_step != 1) { LaunchElementwiseAddWithCastKernel(dev_ctx, fp16_acc_grad, fp16_grad, fp16_acc_grad, fp16_numel, stream); diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index ee2b180586dd2..611d3d2891bac 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -162,6 +162,7 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): kwargs = dict(kwargs) kwargs.pop('clip_after_allreduce', None) kwargs.pop('alignment', None) + kwargs.pop('use_master_acc_grad', None) base_clip = grad_clip if grad_clip is not None else IdentityGradClip( ) kwargs['grad_clip'] = GradClipDecorator(base_clip, @@ -271,6 +272,7 @@ def config(self): distutils.util.strtobool(os.getenv('CLIP_AFTER_ALLREDUCE', 'True'))) max_global_norm = float(os.getenv('MAX_GLOBAL_NORM', -1.0)) gm_steps = int(os.getenv('GRADIENT_MERGE_STEPS', 1)) + use_master_acc_grad = bool(int(os.getenv('USE_MASTER_ACC_GRAD', '1'))) print('clip_after_allreduce = {}, max_global_norm = {}'.format( clip_after_allreduce, max_global_norm)) return { @@ -281,9 +283,14 @@ def config(self): 'grad_clip': paddle.nn.ClipGradByGlobalNorm(max_global_norm) if max_global_norm > 0 else None, + 'use_master_acc_grad': + use_master_acc_grad, } - def run_main(self, use_fp16, use_master_param_norm=True): + def run_main(self, + use_fp16, + use_master_param_norm=True, + use_master_acc_grad=True): if not paddle.is_compiled_with_cuda(): return diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py 
b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py index 324da95f37d80..b2c2b6e31a5f2 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_clip.py @@ -36,7 +36,8 @@ def remove_file_if_exists(file_name): def run_test(clip_after_allreduce=True, max_global_norm=-1.0, - gradient_merge_steps=1): + gradient_merge_steps=1, + use_master_acc_grad=True): if not paddle.is_compiled_with_cuda(): return if os.name == 'nt': @@ -58,6 +59,7 @@ def run_test(clip_after_allreduce=True, os.environ['CLIP_AFTER_ALLREDUCE'] = str(clip_after_allreduce) os.environ['MAX_GLOBAL_NORM'] = str(max_global_norm) os.environ['GRADIENT_MERGE_STEPS'] = str(gradient_merge_steps) + os.environ['USE_MASTER_ACC_GRAD'] = str(1 if use_master_acc_grad else 0) touch_file_env = 'SUCCESS_TOUCH_FILE' touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(os.getpid()) diff --git a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py index c2089b1d97db6..01ca09916a1e6 100644 --- a/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py +++ b/python/paddle/fluid/tests/unittests/test_distributed_fused_lamb_op_with_gradient_merge.py @@ -23,6 +23,12 @@ def test_gm(self): max_global_norm=-1.0, gradient_merge_steps=2) + def test_gm_with_fp16_acc_grad(self): + run_test(clip_after_allreduce=True, + max_global_norm=-1.0, + gradient_merge_steps=2, + use_master_acc_grad=False) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 4fddaff7ec959..3029c3a294a00 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -40,6 +40,7 @@ def __init__(self, alignment=128, use_master_param_norm=True, gradient_accumulation_steps=1, + use_master_acc_grad=True, name=None): assert not framework._non_static_mode( ), "DistributedFusedLamb does not support dygraph mode" @@ -67,6 +68,7 @@ def __init__(self, self._ring_id = 0 self._use_master_param_norm = use_master_param_norm self._gradient_accumulation_steps = gradient_accumulation_steps + self._use_master_acc_grad = use_master_acc_grad assert self._gradient_accumulation_steps >= 1 self.helper = LayerHelper('distributed_fused_lamb') @@ -353,5 +355,6 @@ def _apply_gradients_impl(self, params_grads): 'use_master_param_norm': self._use_master_param_norm, 'is_grad_scaled_by_nranks': self._is_grad_scaled_by_nranks, 'acc_steps': self._gradient_accumulation_steps, + 'use_master_acc_grad': self._use_master_acc_grad, }) return [lamb_op] From 9551e4666733c120a46ed5e59623557bfeffde57 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Tue, 7 Jun 2022 15:06:41 +0200 Subject: [PATCH 28/53] Correct skip_quant condition (#43184) --- .../ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc | 4 ++-- paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc | 2 +- .../contrib/slim/quantization/quant2_int8_mkldnn_pass.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc index e19426d01d195..bd945c139f601 100644 --- 
a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -349,9 +349,9 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( waiting_for_scale.insert(input_name); waiting_for_scale.insert(output_name); } else if (in_iter != var_quant_scales->end()) { - out_iter->second = in_iter->second; + (*var_quant_scales)[output_name] = in_iter->second; } else if (out_iter != var_quant_scales->end()) { - in_iter->second = out_iter->second; + (*var_quant_scales)[input_name] = out_iter->second; } } else if (op_name == "scale") { const std::string output_name = op_node->Op()->Output("Out")[0]; diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index 15100b23407b0..5f92a4bb7f15b 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -38,7 +38,7 @@ void QuantDequantMkldnnPass::MarkSkipQuantizedOps( for (auto* node_input : op_node->inputs) { for (auto* node_input_input : node_input->inputs) { if (!node_input_input->IsOp()) continue; - if (node_input_input->Name().find("quantize_dequantize") == + if (node_input_input->Name().find("quantize") == std::string::npos) { is_quantized_op = false; break; diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index 220016bd653bc..76feab207eebd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -158,7 +158,7 @@ def _label_skip_quantized_op(self, graph): is_quantized_op = True for var_node in op_node.inputs: for front_op_node in var_node.inputs: - if "quantize_dequantize" not in front_op_node.name(): + if "quantize" not in front_op_node.name(): is_quantized_op = False if not is_quantized_op: op_node.op()._set_attr("skip_quant", True) From d5afc1bac8e19cfe0c53fcace4be6d06f60a10ef Mon Sep 17 00:00:00 2001 From: shixingbo <90814748+bmb0537@users.noreply.github.com> Date: Tue, 7 Jun 2022 21:27:01 +0800 Subject: [PATCH 29/53] Optimized the performance of activation op in XPU2 (#43187) --- .../operators/optimizers/cast_with_ptr.h | 2 +- paddle/phi/kernels/funcs/elementwise_base.h | 77 +++++++++++++------ .../kernels/primitive/datamover_primitives.h | 5 +- .../primitive/datamover_primitives_xpu2.h | 25 +++--- 4 files changed, 73 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/optimizers/cast_with_ptr.h b/paddle/fluid/operators/optimizers/cast_with_ptr.h index eb031ae0c933a..ec7db8537b3f3 100644 --- a/paddle/fluid/operators/optimizers/cast_with_ptr.h +++ b/paddle/fluid/operators/optimizers/cast_with_ptr.h @@ -44,7 +44,7 @@ static void VecCastKernel(const platform::CUDADeviceContext &ctx, const InT *x, phi::Array<_ptr_ OutT *, 1> out_arr; out_arr[0] = y; phi::funcs::VectorizedElementwiseKernel - <<>>(in_arr, out_arr, n, main_offset, + <<>>(in_arr, out_arr, n, main_offset, VecSize, FunctorT()); } diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 71dfbc206a191..8b5a3cf8aaafe 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -513,19 +513,23 @@ struct Loader { ArgsT *args, int num, int data_offset, + int read_lens, bool is_boundary) { 
using Type = std::tuple_element_t; - kps::Init(args, static_cast(1.0f)); + kps::Init( + args, static_cast(1.0f), read_lens); if (is_boundary) { kps::ReadData( args, reinterpret_cast(in[Index]) + data_offset, - num); + num, + read_lens); } else { kps::ReadData( args, reinterpret_cast(in[Index]) + data_offset, - num); + num, + read_lens); } } }; @@ -660,11 +664,20 @@ template struct SameDimsElementwisePrimitiveCaller { - __device__ inline void operator()(Functor func, ArgsT *args, OutT *result) { + __device__ inline void operator()(Functor func, + ArgsT *args, + OutT *result, + int read_lens) { +#ifdef PADDLE_WITH_XPU_KP + for (int idx = 0; idx < read_lens; ++idx) { + result[idx] = static_cast(Apply(func, args[idx])); + } +#else #pragma unroll for (int idx = 0; idx < VecSize; ++idx) { result[idx] = static_cast(Apply(func, args[idx])); } +#endif } }; @@ -750,6 +763,7 @@ __device__ void VectorizedElementwiseKernelImpl( phi::Array<_ptr_ OutT *, NumOuts> outs, int num, int data_offset, + int read_lens, Functor func) { using Traits = paddle::platform::FunctionTraits; using ArgsT = typename Traits::ArgsTuple; @@ -757,16 +771,16 @@ __device__ void VectorizedElementwiseKernelImpl( ConditionalT result[VecSize]; Unroller::step( - in, args, num, data_offset, IsBoundary); + in, args, num, data_offset, read_lens, IsBoundary); SameDimsElementwisePrimitiveCaller, VecSize, Functor, ArgsT, - Arity>()(func, args, result); + Arity>()(func, args, result, read_lens); - ElementwiseWriteDataCaller()( - outs, result, data_offset, num); + ElementwiseWriteDataCallerBc()( + outs, result, data_offset, num, read_lens); } template @@ -775,9 +789,10 @@ __global__ void VectorizedElementwiseKernel( phi::Array<_ptr_ OutT *, NumOuts> outs, int size, int main_offset, + int read_lens, Functor func) { - int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + int data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens; + int stride = BLOCK_NUM_X * GRID_NUM_X * read_lens; for (; data_offset < main_offset; data_offset += stride) { VectorizedElementwiseKernelImpl( - ins, outs, VecSize * BLOCK_NUM_X, data_offset, func); + ins, outs, read_lens * BLOCK_NUM_X, data_offset, read_lens, func); } int num = size - data_offset; @@ -795,7 +810,8 @@ __global__ void VectorizedElementwiseKernel( Arity, NumOuts, VecSize, - true>(ins, outs, num, data_offset, func); + true>( + ins, outs, num, data_offset, read_lens, func); } } @@ -803,6 +819,7 @@ template void ElementwiseCudaKernel(const KPDevice &ctx, const std::vector &ins, std::vector *outs, + int read_lens, Functor func) { auto numel = (*outs)[0]->numel(); // To avoid running errors when ins.size()== 0 @@ -817,10 +834,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx, int block_size = 64; int grid_size = 8; auto stream = ctx.x_context()->xpu_stream; - int main_offset = (numel / (VecSize * block_size)) * VecSize * block_size; + int main_offset = (numel / (read_lens * block_size)) * read_lens * block_size; VectorizedElementwiseKernel <<>>( - ins_data, outs_data, numel, main_offset, func); + ins_data, outs_data, numel, main_offset, read_lens, func); #else auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); @@ -829,7 +846,7 @@ void ElementwiseCudaKernel(const KPDevice &ctx, auto stream = ctx.stream(); VectorizedElementwiseKernel <<>>( - ins_data, outs_data, numel, main_offset, func); + ins_data, outs_data, numel, main_offset, VecSize, func); #endif } @@ -868,20 +885,32 @@ void ElementwiseKernel(const KPDevice &ctx, } } 
+#ifdef PADDLE_WITH_XPU_KP + const int buf_size = 256; + int numel = (*outs)[0]->numel(); + int block_size = 64; + int grid_size = 8; + int nthreads = block_size * grid_size; + int read_lens = + std::min(buf_size, kps::details::RoundUpDiv(numel, 32 * nthreads) * 32); + int vec_size = buf_size; +#else // calculate the max vec_size for all ins and outs int vec_size = GetVectorizedSizeForTensors(ins, *outs); + int read_lens = vec_size; +#endif switch (vec_size) { - case 4: - ElementwiseCudaKernel( - ctx, ins, outs, func); + case VecSizeL: + ElementwiseCudaKernel( + ctx, ins, outs, read_lens, func); break; - case 2: - ElementwiseCudaKernel( - ctx, ins, outs, func); + case VecSizeM: + ElementwiseCudaKernel( + ctx, ins, outs, read_lens, func); break; - case 1: - ElementwiseCudaKernel( - ctx, ins, outs, func); + case VecSizeS: + ElementwiseCudaKernel( + ctx, ins, outs, read_lens, func); break; default: { PADDLE_THROW(phi::errors::Unimplemented( diff --git a/paddle/phi/kernels/primitive/datamover_primitives.h b/paddle/phi/kernels/primitive/datamover_primitives.h index 8b0c42c9d19b1..bf60d1610e322 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives.h +++ b/paddle/phi/kernels/primitive/datamover_primitives.h @@ -259,7 +259,7 @@ __device__ __forceinline__ void Init(T* dst, T init_data, int read_lens) { * it supports different data types of inputs. */ template -__device__ __forceinline__ void Init(ArgsT* dst, T init_data) { +__device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) { #pragma unroll for (int i = 0; i < NX; i++) { std::get(dst[i]) = init_data; @@ -382,7 +382,8 @@ template __device__ __forceinline__ void ReadData(ArgsT* dst, const T* __restrict__ src, - int num) { + int num, + int read_lens) { if (IsBoundary) { // blockDim.x * NX > num int thread_offset = threadIdx.x * NX; #pragma unroll diff --git a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h index 1e5dfe2a542b0..f2d187f89b252 100644 --- a/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h +++ b/paddle/phi/kernels/primitive/datamover_primitives_xpu2.h @@ -21,6 +21,8 @@ namespace phi { namespace kps { namespace details { +int RoundUpDiv(int n, int k) { return (n + k - 1) / k; } + enum class OptType { // Optimize type of calc after input shape compressed CanNotOptimize = -1, // can not optimize, broadcast first N_1, // just like {1} op {100} or {100} op {1} @@ -425,9 +427,10 @@ __device__ __inline__ void Init(T* dst, T init_data, int read_lens) { * it supports different data types of inputs. 
*/ template -__device__ __forceinline__ void Init(ArgsT* dst, T init_data) { +__device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) { + mfence(); #pragma unroll - for (int i = 0; i < NX; i++) { + for (int i = 0; i < read_lens; i++) { std::get(dst[i]) = init_data; } } @@ -523,22 +526,24 @@ template __device__ __forceinline__ void ReadData(ArgsT* dst, const T _global_ptr_* src, - int num) { - int thread_offset = core_id() * NX; + int num, + int read_lens) { + int thread_offset = core_id() * read_lens; __local__ T in_temp[1]; __local__ T in_vec[NX]; - if (IsBoundary) { // core_num() * NX > num + if (IsBoundary) { // core_num() * read_lens > num #pragma unroll - for (int idx = 0; idx < NX; ++idx) { + for (int idx = 0; idx < read_lens; ++idx) { if (idx + thread_offset < num) { GM2LM(src + thread_offset + idx, in_temp, sizeof(T)); std::get(dst[idx]) = in_temp[0]; + mfence(); } } - } else { // core_num() * NX < num - GM2LM(src + thread_offset, in_vec, NX * sizeof(T)); + } else { // core_num() * read_lens < num + GM2LM(src + thread_offset, in_vec, read_lens * sizeof(T)); #pragma unroll - for (int idx = 0; idx < NX; ++idx) { + for (int idx = 0; idx < read_lens; ++idx) { std::get(dst[idx]) = in_vec[idx]; } } @@ -727,10 +732,12 @@ __device__ void WriteData(T _global_ptr_* dst, for (int idx = 0; idx < read_lens; ++idx) { if (idx + thread_offset < num) { in_temp[0] = src[idx]; + mfence(); LM2GM(in_temp, dst + idx + thread_offset, sizeof(T)); } } } else { // core_num() * read_lens < num + mfence(); LM2GM(src, dst + thread_offset, read_lens * sizeof(T)); } } From a3f9bcb239e3ddf6e75d5bed358c9280f111767b Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Tue, 7 Jun 2022 22:13:35 +0800 Subject: [PATCH 30/53] Modify quantization use tempfile to place the temporary files (#43267) --- .../slim/tests/test_imperative_out_scale.py | 53 +++---- .../contrib/slim/tests/test_imperative_ptq.py | 129 +++++++++--------- .../slim/tests/test_imperative_qat_amp.py | 13 +- ...t_post_training_quantization_lstm_model.py | 13 +- .../test_post_training_quantization_mnist.py | 14 +- ..._post_training_quantization_mobilenetv1.py | 13 +- .../tests/test_quantization_scale_pass.py | 11 +- .../tests/test_user_defined_quantization.py | 16 ++- .../unittests/test_inference_model_io.py | 24 +++- ...est_save_inference_model_conditional_op.py | 25 ++-- 10 files changed, 150 insertions(+), 161 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py index 5e0269a271790..e12c8031cf46b 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_out_scale.py @@ -20,6 +20,7 @@ import unittest import logging import warnings +import tempfile import paddle import paddle.fluid as fluid @@ -111,6 +112,16 @@ def forward(self, inputs): class TestImperativeOutSclae(unittest.TestCase): + def setUp(self): + self.root_path = tempfile.TemporaryDirectory() + self.param_save_path = os.path.join(self.root_path.name, + "lenet.pdparams") + self.save_path = os.path.join(self.root_path.name, + "lenet_dynamic_outscale_infer_model") + + def tearDown(self): + self.root_path.cleanup() + def func_out_scale_acc(self): seed = 1000 lr = 0.001 @@ -138,46 +149,16 @@ def func_out_scale_acc(self): loss_list = train_lenet(lenet, reader, adam) lenet.eval() - param_save_path = "test_save_quantized_model/lenet.pdparams" save_dict = lenet.state_dict() - 
paddle.save(save_dict, param_save_path) - - save_path = "./dynamic_outscale_infer_model/lenet" - imperative_out_scale.save_quantized_model( - layer=lenet, - path=save_path, - input_spec=[ - paddle.static.InputSpec(shape=[None, 1, 28, 28], - dtype='float32') - ]) + paddle.save(save_dict, self.param_save_path) for i in range(len(loss_list) - 1): self.assertTrue(loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') - def test_out_scale_acc(self): - with _test_eager_guard(): - self.func_out_scale_acc() - self.func_out_scale_acc() - - -class TestSaveQuanztizedModelFromCheckPoint(unittest.TestCase): - - def func_save_quantized_model(self): - lr = 0.001 - - load_param_path = "test_save_quantized_model/lenet.pdparams" - save_path = "./dynamic_outscale_infer_model_from_checkpoint/lenet" - - weight_quantize_type = 'abs_max' - activation_quantize_type = 'moving_average_abs_max' - imperative_out_scale = ImperativeQuantAware( - weight_quantize_type=weight_quantize_type, - activation_quantize_type=activation_quantize_type) - with fluid.dygraph.guard(): lenet = ImperativeLenet() - load_dict = paddle.load(load_param_path) + load_dict = paddle.load(self.param_save_path) imperative_out_scale.quantize(lenet) lenet.set_dict(load_dict) @@ -191,7 +172,7 @@ def func_save_quantized_model(self): imperative_out_scale.save_quantized_model( layer=lenet, - path=save_path, + path=self.save_path, input_spec=[ paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32') @@ -201,10 +182,10 @@ def func_save_quantized_model(self): self.assertTrue(loss_list[i] > loss_list[i + 1], msg='Failed to do the imperative qat.') - def test_save_quantized_model(self): + def test_out_scale_acc(self): with _test_eager_guard(): - self.func_save_quantized_model() - self.func_save_quantized_model() + self.func_out_scale_acc() + self.func_out_scale_acc() if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py index 402113e5f8d78..cde739b2c9f74 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_ptq.py @@ -22,6 +22,7 @@ import unittest import copy import logging +import tempfile import paddle.nn as nn import paddle @@ -73,10 +74,6 @@ class TestImperativePTQ(unittest.TestCase): @classmethod def setUpClass(cls): - timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - cls.root_path = os.path.join(os.getcwd(), "imperative_ptq_" + timestamp) - cls.save_path = os.path.join(cls.root_path, "model") - cls.download_path = 'dygraph_int8/download' cls.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + cls.download_path) @@ -89,14 +86,6 @@ def setUpClass(cls): paddle.static.default_main_program().random_seed = seed paddle.static.default_startup_program().random_seed = seed - @classmethod - def tearDownClass(cls): - try: - pass - # shutil.rmtree(cls.root_path) - except Exception as e: - print("Failed to delete {} due to {}".format(cls.root_path, str(e))) - def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): cmd = 'mkdir {0} && tar xf {1} -C {0}'.format( @@ -217,32 +206,34 @@ def func_ptq(self): input_spec = [ paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32') ] - self.ptq.save_quantized_model(model=quant_model, - path=self.save_path, - input_spec=input_spec) - print('Quantized model saved in {%s}' % self.save_path) - - after_acc_top1 = self.model_test(quant_model, 
self.batch_num, - self.batch_size) - - paddle.enable_static() - infer_acc_top1 = self.program_test(self.save_path, self.batch_num, - self.batch_size) - paddle.disable_static() - - # Check - print('Before converted acc_top1: %s' % before_acc_top1) - print('After converted acc_top1: %s' % after_acc_top1) - print('Infer acc_top1: %s' % infer_acc_top1) - - self.assertTrue(after_acc_top1 >= self.eval_acc_top1, - msg="The test acc {%f} is less than {%f}." % - (after_acc_top1, self.eval_acc_top1)) - self.assertTrue(infer_acc_top1 >= after_acc_top1, - msg='The acc is lower after converting model.') - - end_time = time.time() - print("total time: %ss \n" % (end_time - start_time)) + with tempfile.TemporaryDirectory(prefix="imperative_ptq_") as tmpdir: + save_path = os.path.join(tmpdir, "model") + self.ptq.save_quantized_model(model=quant_model, + path=save_path, + input_spec=input_spec) + print('Quantized model saved in {%s}' % save_path) + + after_acc_top1 = self.model_test(quant_model, self.batch_num, + self.batch_size) + + paddle.enable_static() + infer_acc_top1 = self.program_test(save_path, self.batch_num, + self.batch_size) + paddle.disable_static() + + # Check + print('Before converted acc_top1: %s' % before_acc_top1) + print('After converted acc_top1: %s' % after_acc_top1) + print('Infer acc_top1: %s' % infer_acc_top1) + + self.assertTrue(after_acc_top1 >= self.eval_acc_top1, + msg="The test acc {%f} is less than {%f}." % + (after_acc_top1, self.eval_acc_top1)) + self.assertTrue(infer_acc_top1 >= after_acc_top1, + msg='The acc is lower after converting model.') + + end_time = time.time() + print("total time: %ss \n" % (end_time - start_time)) def test_ptq(self): with _test_eager_guard(): @@ -278,36 +269,38 @@ def func_ptq(self): input_spec = [ paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32') ] - self.ptq.save_quantized_model(model=quant_model, - path=self.save_path, - input_spec=input_spec) - print('Quantized model saved in {%s}' % self.save_path) - - after_acc_top1 = self.model_test(quant_model, self.batch_num, - self.batch_size) - - paddle.enable_static() - infer_acc_top1 = self.program_test(self.save_path, self.batch_num, - self.batch_size) - paddle.disable_static() - - # Check - print('Before converted acc_top1: %s' % before_acc_top1) - print('After converted acc_top1: %s' % after_acc_top1) - print('Infer acc_top1: %s' % infer_acc_top1) - - #Check whether the quant_model is correct after converting. - #The acc of quantized model should be higher than 0.95. - self.assertTrue(after_acc_top1 >= self.eval_acc_top1, - msg="The test acc {%f} is less than {%f}." % - (after_acc_top1, self.eval_acc_top1)) - #Check the saved infer_model.The acc of infer model - #should not be lower than the one of dygraph model. 
- self.assertTrue(infer_acc_top1 >= after_acc_top1, - msg='The acc is lower after converting model.') - - end_time = time.time() - print("total time: %ss \n" % (end_time - start_time)) + with tempfile.TemporaryDirectory(prefix="imperative_ptq_") as tmpdir: + save_path = os.path.join(tmpdir, "model") + self.ptq.save_quantized_model(model=quant_model, + path=save_path, + input_spec=input_spec) + print('Quantized model saved in {%s}' % save_path) + + after_acc_top1 = self.model_test(quant_model, self.batch_num, + self.batch_size) + + paddle.enable_static() + infer_acc_top1 = self.program_test(save_path, self.batch_num, + self.batch_size) + paddle.disable_static() + + # Check + print('Before converted acc_top1: %s' % before_acc_top1) + print('After converted acc_top1: %s' % after_acc_top1) + print('Infer acc_top1: %s' % infer_acc_top1) + + #Check whether the quant_model is correct after converting. + #The acc of quantized model should be higher than 0.95. + self.assertTrue(after_acc_top1 >= self.eval_acc_top1, + msg="The test acc {%f} is less than {%f}." % + (after_acc_top1, self.eval_acc_top1)) + #Check the saved infer_model.The acc of infer model + #should not be lower than the one of dygraph model. + self.assertTrue(infer_acc_top1 >= after_acc_top1, + msg='The acc is lower after converting model.') + + end_time = time.time() + print("total time: %ss \n" % (end_time - start_time)) def test_ptq(self): with _test_eager_guard(): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py index 804c56cfd873b..e40816f39545a 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_amp.py @@ -21,6 +21,7 @@ import time import unittest import logging +import tempfile import paddle import paddle.fluid as fluid @@ -46,10 +47,9 @@ class TestImperativeQatAmp(unittest.TestCase): @classmethod def setUpClass(cls): - timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - cls.root_path = os.path.join(os.getcwd(), - "imperative_qat_amp_" + timestamp) - cls.save_path = os.path.join(cls.root_path, "model") + cls.root_path = tempfile.TemporaryDirectory( + prefix="imperative_qat_amp_") + cls.save_path = os.path.join(cls.root_path.name, "model") cls.download_path = 'dygraph_int8/download' cls.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + @@ -65,10 +65,7 @@ def setUpClass(cls): @classmethod def tearDownClass(cls): - try: - shutil.rmtree(cls.root_path) - except Exception as e: - print("Failed to delete {} due to {}".format(cls.root_path, str(e))) + cls.root_path.cleanup() def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py index 1beb0f916d48e..6100ed4f82a0e 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_lstm_model.py @@ -20,6 +20,7 @@ import functools import contextlib import struct +import tempfile import numpy as np import paddle import paddle.fluid as fluid @@ -38,9 +39,9 @@ def setUp(self): self.download_path = 'int8/download' self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + self.download_path) - self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', 
time.localtime()) - self.int8_model_path = os.path.join(os.getcwd(), - "post_training_" + self.timestamp) + self.root_path = tempfile.TemporaryDirectory() + self.int8_model_path = os.path.join(self.root_path.name, + "post_training_quantization") try: os.system("mkdir -p " + self.int8_model_path) except Exception as e: @@ -49,11 +50,7 @@ def setUp(self): sys.exit(-1) def tearDown(self): - try: - os.system("rm -rf {}".format(self.int8_model_path)) - except Exception as e: - print("Failed to delete {} due to {}".format( - self.int8_model_path, str(e))) + self.root_path.cleanup() def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py index cb76f4bbac084..ca2bf80765ec9 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mnist.py @@ -18,6 +18,7 @@ import random import math import functools +import tempfile import contextlib import numpy as np import paddle @@ -34,12 +35,12 @@ class TestPostTrainingQuantization(unittest.TestCase): def setUp(self): + self.root_path = tempfile.TemporaryDirectory() + self.int8_model_path = os.path.join(self.root_path.name, + "post_training_quantization") self.download_path = 'int8/download' self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + self.download_path) - self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - self.int8_model_path = os.path.join(os.getcwd(), - "post_training_" + self.timestamp) try: os.system("mkdir -p " + self.int8_model_path) except Exception as e: @@ -48,11 +49,7 @@ def setUp(self): sys.exit(-1) def tearDown(self): - try: - os.system("rm -rf {}".format(self.int8_model_path)) - except Exception as e: - print("Failed to delete {} due to {}".format( - self.int8_model_path, str(e))) + self.root_path.cleanup() def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): @@ -123,7 +120,6 @@ def generate_quantized_model(self, place = fluid.CPUPlace() exe = fluid.Executor(place) - scope = fluid.global_scope() val_reader = paddle.dataset.mnist.train() ptq = PostTrainingQuantization(executor=exe, diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index b36f036d41590..9c076d85fd2d5 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -19,6 +19,7 @@ import math import functools import contextlib +import tempfile import numpy as np from PIL import Image, ImageEnhance import paddle @@ -150,16 +151,12 @@ def setUp(self): self.infer_iterations = 50000 if os.environ.get( 'DATASET') == 'full' else 2 - self.timestamp = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime()) - self.int8_model = os.path.join(os.getcwd(), - "post_training_" + self.timestamp) + self.root_path = tempfile.TemporaryDirectory() + self.int8_model = os.path.join(self.root_path.name, + "post_training_quantization") def tearDown(self): - try: - os.system("rm -rf {}".format(self.int8_model)) - except Exception as e: - print("Failed to delete {} due to {}".format( - self.int8_model, str(e))) + self.root_path.cleanup() def 
cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py index acf3c68600ce6..0f4c450cfa98d 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_scale_pass.py @@ -17,6 +17,7 @@ import random import numpy as np import six +import tempfile import paddle.fluid as fluid import paddle from paddle.fluid.framework import IrGraph @@ -166,15 +167,19 @@ def build_program(main, startup, is_test): marked_nodes.add(op) test_graph.draw('.', 'quant_scale' + dev_name, marked_nodes) - with open('quant_scale_model' + dev_name + '.txt', 'w') as f: + tempdir = tempfile.TemporaryDirectory() + mapping_table_path = os.path.join( + tempdir.name, 'quant_scale_model' + dev_name + '.txt') + save_path = os.path.join(tempdir.name, 'quant_scale_model' + dev_name) + with open(mapping_table_path, 'w') as f: f.write(str(server_program)) with fluid.scope_guard(scope): - fluid.io.save_inference_model('quant_scale_model' + dev_name, - ['image', 'label'], [loss], + fluid.io.save_inference_model(save_path, ['image', 'label'], [loss], exe, server_program, clip_extra=True) + tempdir.cleanup() def test_quant_scale_cuda(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py index 96c56529cf14b..ec9ab8820a613 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py +++ b/python/paddle/fluid/contrib/slim/tests/test_user_defined_quantization.py @@ -18,6 +18,7 @@ import random import numpy as np import six +import tempfile import paddle.fluid as fluid import paddle from paddle.fluid.framework import IrGraph @@ -110,18 +111,20 @@ def build_program(main, startup, is_test): def get_optimizer(): return fluid.optimizer.MomentumOptimizer(0.0001, 0.9) - def load_dict(): - with open('mapping_table_for_saving_inference_model', 'r') as file: + def load_dict(mapping_table_path): + with open(mapping_table_path, 'r') as file: data = file.read() data = json.loads(data) return data - def save_dict(Dict): - with open('mapping_table_for_saving_inference_model', 'w') as file: + def save_dict(Dict, mapping_table_path): + with open(mapping_table_path, 'w') as file: file.write(json.dumps(Dict)) random.seed(0) np.random.seed(0) + tempdir = tempfile.TemporaryDirectory() + mapping_table_path = os.path.join(tempdir.name, 'inference') main = fluid.Program() startup = fluid.Program() @@ -162,7 +165,7 @@ def save_dict(Dict): executor=exe) test_transform_pass.apply(test_graph) - save_dict(test_graph.out_node_mapping_table) + save_dict(test_graph.out_node_mapping_table, mapping_table_path) add_quant_dequant_pass = AddQuantDequantPass(scope=scope, place=place) add_quant_dequant_pass.apply(main_graph) @@ -203,10 +206,11 @@ def save_dict(Dict): activation_bits=8, weight_quantize_type=weight_quant_type) - mapping_table = load_dict() + mapping_table = load_dict(mapping_table_path) test_graph.out_node_mapping_table = mapping_table if act_quantize_func == None and weight_quantize_func == None: freeze_pass.apply(test_graph) + tempdir.cleanup() def test_act_preprocess_cuda(self): if fluid.core.is_compiled_with_cuda(): diff --git a/python/paddle/fluid/tests/unittests/test_inference_model_io.py 
b/python/paddle/fluid/tests/unittests/test_inference_model_io.py index c19c2c65e6e6a..89b7771700f57 100644 --- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py +++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py @@ -18,6 +18,7 @@ import os import six +import tempfile import numpy as np import paddle.fluid.core as core import paddle.fluid as fluid @@ -46,8 +47,9 @@ def __init__(self, list): class TestBook(unittest.TestCase): def test_fit_line_inference_model(self): - MODEL_DIR = "./tmp/inference_model" - UNI_MODEL_DIR = "./tmp/inference_model1" + root_path = tempfile.TemporaryDirectory() + MODEL_DIR = os.path.join(root_path.name, "inference_model") + UNI_MODEL_DIR = os.path.join(root_path.name, "inference_model1") init_program = Program() program = Program() @@ -118,6 +120,8 @@ def test_fit_line_inference_model(self): print("fetch %s" % str(model.fetch_vars[0])) self.assertEqual(expected, actual) + root_path.cleanup() + self.assertRaises(ValueError, fluid.io.load_inference_model, None, exe, model_str, None) @@ -125,7 +129,8 @@ def test_fit_line_inference_model(self): class TestSaveInferenceModel(unittest.TestCase): def test_save_inference_model(self): - MODEL_DIR = "./tmp/inference_model2" + root_path = tempfile.TemporaryDirectory() + MODEL_DIR = os.path.join(root_path.name, "inference_model2") init_program = Program() program = Program() @@ -144,9 +149,11 @@ def test_save_inference_model(self): exe.run(init_program, feed={}, fetch_list=[]) save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + root_path.cleanup() def test_save_inference_model_with_auc(self): - MODEL_DIR = "./tmp/inference_model4" + root_path = tempfile.TemporaryDirectory() + MODEL_DIR = os.path.join(root_path.name, "inference_model4") init_program = Program() program = Program() @@ -168,6 +175,7 @@ def test_save_inference_model_with_auc(self): warnings.simplefilter("always") save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) + root_path.cleanup() expected_warn = "please ensure that you have set the auc states to zeros before saving inference model" self.assertTrue(len(w) > 0) self.assertTrue(expected_warn == str(w[0].message)) @@ -176,7 +184,8 @@ def test_save_inference_model_with_auc(self): class TestInstance(unittest.TestCase): def test_save_inference_model(self): - MODEL_DIR = "./tmp/inference_model3" + root_path = tempfile.TemporaryDirectory() + MODEL_DIR = os.path.join(root_path.name, "inference_model3") init_program = Program() program = Program() @@ -202,12 +211,14 @@ def test_save_inference_model(self): save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, cp_prog) self.assertRaises(TypeError, save_inference_model, [MODEL_DIR, ["x", "y"], [avg_cost], [], cp_prog]) + root_path.cleanup() class TestSaveInferenceModelNew(unittest.TestCase): def test_save_and_load_inference_model(self): - MODEL_DIR = "./tmp/inference_model5" + root_path = tempfile.TemporaryDirectory() + MODEL_DIR = os.path.join(root_path.name, "inference_model5") init_program = fluid.default_startup_program() program = fluid.default_main_program() @@ -303,6 +314,7 @@ def test_save_and_load_inference_model(self): model = InferModel(paddle.static.io.load_inference_model( MODEL_DIR, exe)) + root_path.cleanup() outs = exe.run(model.program, feed={ diff --git a/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py b/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py index 9f8f9c382ca01..9075f40a57ec2 100644 --- 
a/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py +++ b/python/paddle/fluid/tests/unittests/test_save_inference_model_conditional_op.py @@ -17,6 +17,7 @@ import os import unittest import numpy as np +import tempfile import paddle import paddle.fluid as fluid @@ -93,54 +94,60 @@ def test_while_op(self): paddle.static.InputSpec( shape=[1, 3, 8, 8], dtype='float32') ]) - paddle.jit.save(net, './while_net') + root_path = tempfile.TemporaryDirectory() + model_file = os.path.join(root_path.name, "while_net") + paddle.jit.save(net, model_file) right_pdmodel = set([ "uniform_random", "shape", "slice", "not_equal", "while", "elementwise_add" ]) paddle.enable_static() - pdmodel = getModelOp("while_net.pdmodel") - #print(len(right_pdmodel.difference(pdmodel))) + pdmodel = getModelOp(model_file + ".pdmodel") self.assertTrue( len(right_pdmodel.difference(pdmodel)) == 0, "The while op is pruned by mistake.") + root_path.cleanup() def test_for_op(self): paddle.disable_static() net = ForNet() net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec(shape=[1], dtype='int32')]) - paddle.jit.save(net, './for_net') + root_path = tempfile.TemporaryDirectory() + model_file = os.path.join(root_path.name, "for_net") + paddle.jit.save(net, model_file) right_pdmodel = set([ "randint", "fill_constant", "cast", "less_than", "while", "elementwise_add" ]) paddle.enable_static() - pdmodel = getModelOp("for_net.pdmodel") - #print(len(right_pdmodel.difference(pdmodel))) + pdmodel = getModelOp(model_file + ".pdmodel") self.assertTrue( len(right_pdmodel.difference(pdmodel)) == 0, "The for op is pruned by mistake.") + root_path.cleanup() def test_if_op(self): paddle.disable_static() net = IfElseNet() net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec(shape=[1], dtype='int32')]) - paddle.jit.save(net, './if_net') + root_path = tempfile.TemporaryDirectory() + model_file = os.path.join(root_path.name, "if_net") + paddle.jit.save(net, model_file) right_pdmodel = set([ "assign_value", "greater_than", "cast", "conditional_block", "logical_not", "select_input" ]) paddle.enable_static() - pdmodel = getModelOp("if_net.pdmodel") - #print(len(right_pdmodel.difference(pdmodel))) + pdmodel = getModelOp(model_file + ".pdmodel") self.assertTrue( len(right_pdmodel.difference(pdmodel)) == 0, "The if op is pruned by mistake.") + root_path.cleanup() if __name__ == '__main__': From fd40502ebf9da650795cbd88cbd50fac14f2f905 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Tue, 7 Jun 2022 23:27:38 +0800 Subject: [PATCH 31/53] [Eager] Enable legacy for test_layout_autotune (#43273) --- python/paddle/fluid/tests/unittests/test_layout_autotune.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index bd73d9526c0ef..d29f47c8ab11d 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -20,6 +20,9 @@ import warnings import json import os +from paddle.fluid.framework import _enable_legacy_dygraph + +_enable_legacy_dygraph() class SimpleNet(paddle.nn.Layer): From 5434d663d396d7e97690a5547f1878daee91ffde Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C5=82awomir=20Siwek?= Date: Tue, 7 Jun 2022 17:49:34 +0200 Subject: [PATCH 32/53] Matmul post-ops for fuses (#43198) * add method for post ops * format code * change post-ops pattern * code style --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc 
| 70 +++++++++---------- .../mkldnn/conv_transpose_mkldnn_op.cc | 16 ++--- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 11 ++- .../operators/mkldnn/matmul_mkldnn_op.cc | 39 ++++++----- 4 files changed, 66 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 65092e059f4af..7b790a6081ed7 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -144,12 +144,6 @@ class ConvMKLDNNHandlerT bias->dims().size())); } - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); - const bool fuse_residual_conn = - ctx.Attr("fuse_residual_connection"); const int groups = ctx.Attr("groups"); const std::string padding_algorithm = ctx.Attr("padding_algorithm"); @@ -221,24 +215,7 @@ class ConvMKLDNNHandlerT const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference : dnnl::prop_kind::forward_training; - float sum_scale = 1.0f; - float activation_scale = 1.0f; - std::vector output_shift_scale; - if (platform::is_int8()) { - if (ctx.HasAttr("Sum_scale")) { - sum_scale = ctx.Attr("Sum_scale"); - activation_scale = ctx.Attr("Activation_scale"); - output_shift_scale = - ctx.Attr>("Output_shift_scale"); - } else { - std::tie(sum_scale, output_shift_scale, activation_scale) = - get_int8_scales(ctx); - } - } - - const dnnl::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, - output_shift_scale, sum_scale, activation_scale); // for INT8 only! + const dnnl::primitive_attr conv_attr = CreateConvAttrs(ctx); if (bias) { auto bias_tz = phi::vectorize(bias->dims()); @@ -460,12 +437,13 @@ class ConvMKLDNNHandlerT auto scale_weights_data = ctx.Attr>("Scale_weights"); bool is_multi_channel = scale_weights_data.size() > 1; bool has_activation = !ctx.Attr("fuse_activation").empty(); - float activation_scale = force_fp32_output ? 1.0f - : has_activation ? ctx.Attr("Scale_out") - : 1.0f; - auto scale_out_data = force_fp32_output ? 1.0f - : has_activation ? 1.0f - : ctx.Attr("Scale_out"); + float activation_scale = (!force_fp32_output && has_activation) + ? ctx.Attr("Scale_out") + : 1.0f; + + float scale_out_data = (force_fp32_output || has_activation) + ? 1.0f + : ctx.Attr("Scale_out"); float sum_scale = fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; int count = @@ -490,15 +468,33 @@ class ConvMKLDNNHandlerT return std::make_tuple(sum_scale, output_shift_scale, activation_scale); } - dnnl::primitive_attr CreatePostOps( - std::string fuse_activation, float fuse_alpha, float fuse_beta, - bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f, float activation_scale = 1.0f) { + dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) { dnnl::primitive_attr conv_attr; dnnl::post_ops post_operations; - if (output_shift_scale.size() > 0) { - int mask = output_shift_scale.size() > 1 ? 
1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); + + const std::string fuse_activation = + ctx.Attr("fuse_activation"); + const float fuse_alpha = ctx.Attr("fuse_alpha"); + const float fuse_beta = ctx.Attr("fuse_beta"); + const bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + + float sum_scale = 1.0f; + float activation_scale = 1.0f; + std::vector output_shift_scale; + if (platform::is_int8()) { + if (ctx.HasAttr("Sum_scale")) { + sum_scale = ctx.Attr("Sum_scale"); + activation_scale = ctx.Attr("Activation_scale"); + output_shift_scale = ctx.Attr>("Output_shift_scale"); + } else { + std::tie(sum_scale, output_shift_scale, activation_scale) = + get_int8_scales(ctx); + } + + if (output_shift_scale.size() > 0) { + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + } } // Fusion with Elementwise layer relies on adding a sum post-operation with diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index 99b8d7d6ae385..615c7299bed03 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -139,10 +139,6 @@ class ConvTransposeMKLDNNHandlerT * the memory format preferred for best performance */ const auto chosen_memory_format = MKLDNNMemoryFormat::any; - const std::string fuse_activation = - ctx.Attr("fuse_activation"); - const float fuse_alpha = ctx.Attr("fuse_alpha"); - const float fuse_beta = ctx.Attr("fuse_beta"); auto data_type = dnnl::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || @@ -156,8 +152,7 @@ class ConvTransposeMKLDNNHandlerT const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const dnnl::primitive_attr conv_trans_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta); + const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx); auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference : dnnl::prop_kind::forward_training; if (bias) { @@ -176,12 +171,15 @@ class ConvTransposeMKLDNNHandlerT } } - dnnl::primitive_attr CreatePostOps(const std::string& fuse_activation, - const float& fuse_alpha, - const float& fuse_beta) { + dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) { dnnl::primitive_attr conv_attr; dnnl::post_ops post_operations; + const std::string fuse_activation = + ctx.Attr("fuse_activation"); + const float fuse_alpha = ctx.Attr("fuse_alpha"); + const float fuse_beta = ctx.Attr("fuse_beta"); + // Fusion with ReLU layer is executed through the PostOps feature. Create a // PostOps object and configure it to execute an eltwise relu operation. 
if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 5cbcad5d965a4..590ffe4d0d41b 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -201,7 +201,7 @@ class FCPrimitiveFactory { CreateMemDescriptor(weight_dims, MKLDNNMemoryFormat::any); auto bias_desc = CreateMemDescriptor(bias, MKLDNNMemoryFormat::x); auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); - const auto attrs = CreatePostOps(ctx); + const auto attrs = CreateFCAttrs(ctx); return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); } @@ -230,7 +230,7 @@ class FCPrimitiveFactory { auto dst_dims = {input_dims[0] * input_dims[1], weight_dims[0]}; auto dst_desc = CreateMemDescriptor(dst_dims, MKLDNNMemoryFormat::any); - const auto attrs = CreatePostOps(ctx); + const auto attrs = CreateFCAttrs(ctx); return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); } @@ -255,7 +255,7 @@ class FCPrimitiveFactory { auto weights_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::any); auto bias_desc = CreateMemDescriptor(bias, MKLDNNMemoryFormat::x); auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); - const auto attrs = CreatePostOps(ctx); + const auto attrs = CreateFCAttrs(ctx); return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); } @@ -455,8 +455,7 @@ class FCPrimitiveFactory { bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales); } - // Fuse relu into FC with activation type attribute has been set to 'relu' - dnnl::primitive_attr CreatePostOps(const ExecutionContext& ctx) { + dnnl::primitive_attr CreateFCAttrs(const ExecutionContext& ctx) { dnnl::primitive_attr attributes; dnnl::post_ops post_operations; @@ -465,8 +464,8 @@ class FCPrimitiveFactory { std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); int mask = CreateMask(1, output_shift_scale.size() > 1); attributes.set_output_scales(mask, output_shift_scale); - float sum_scale = 1.0f; + float sum_scale = 1.0f; if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { post_operations.append_sum(sum_scale); diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index 8921db6cbcef9..12867a482c79f 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -147,16 +147,10 @@ class MatMulMKLDNNHandler this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md); } // Constructor for FWD MatMul - MatMulMKLDNNHandler(const dnnl::engine engine, const ExecutionContext& ctx, - float scale) + MatMulMKLDNNHandler(const dnnl::engine engine, const ExecutionContext& ctx) : paddle::platform::MKLDNNHandlerNoCachingT( engine, ctx.GetPlace()) { - dnnl::primitive_attr attr; - float scale_out = ComputeOutputScale(ctx); - if (scale_out != 1.0f) { - constexpr unsigned tensor_wide_scale = 0; - attr.set_output_scales(tensor_wide_scale, {scale_out}); - } + const dnnl::primitive_attr matmul_attrs = CreateMatmulAttrs(ctx); auto matmul_dims_ = GetMatmulDims(ctx); auto x_md = memory::desc(matmul_dims_.x_dims, MKLDNNGetDataType(), @@ -165,7 +159,7 @@ class MatMulMKLDNNHandler matmul_dims_.y_strides); auto out_md = memory::desc(matmul_dims_.out_dims, MKLDNNGetDataType(), matmul_dims_.out_strides); - this->AcquireForwardPrimitiveDescriptor(attr, 
x_md, y_md, out_md); + this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md); } std::shared_ptr AcquireWeightsMemory(const Tensor* input) { @@ -429,6 +423,19 @@ class MatMulMKLDNNHandler return std::make_tuple(x_offset_, y_offset_, out_offset_); } + dnnl::primitive_attr CreateMatmulAttrs(const ExecutionContext& ctx) { + dnnl::primitive_attr matmul_attrs; + dnnl::post_ops post_operations; + + float scale_out = ComputeOutputScale(ctx); + if (scale_out != 1.0f) { + matmul_attrs.set_output_scales(0, {scale_out}); + } + + matmul_attrs.set_post_ops(post_operations); + return matmul_attrs; + } + private: uint32_t x_offset_; uint32_t y_offset_; @@ -499,23 +506,19 @@ static void ExecuteMatMul(const ExecutionContext& ctx) { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Output("Out"); - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 1.0f; const auto& dev_ctx = ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); if (force_fp32_output || ((!is_int8) && (!is_bfloat16))) { - MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) - .Execute(x, y, out); + MatMulMKLDNNHandler(onednn_engine, ctx).Execute(x, y, out); } else if (is_bfloat16) { - MatMulMKLDNNHandler(dev_ctx.GetEngine(), - ctx, alpha) + MatMulMKLDNNHandler(onednn_engine, ctx) .Execute(x, y, out); } else if (fuse_relu) { - MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) - .Execute(x, y, out); + MatMulMKLDNNHandler(onednn_engine, ctx).Execute(x, y, out); } else { - MatMulMKLDNNHandler(dev_ctx.GetEngine(), ctx, alpha) - .Execute(x, y, out); + MatMulMKLDNNHandler(onednn_engine, ctx).Execute(x, y, out); } } From 9c17688a14ae72d16cf303b7079ee593a283e19f Mon Sep 17 00:00:00 2001 From: zmxdream Date: Wed, 8 Jun 2022 08:56:41 +0800 Subject: [PATCH 33/53] [GPUPS]Optimize dump_pool_to_cpu for dymf (#43219) * optimize dump_to_cpu for dymf * code clean. test=develop * fix func. test=develop * fix code style. test=develop * fix. 
test=develop --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 40 +++++++++++++------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 65892f8488475..ee2063a5d9abc 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -838,29 +838,42 @@ void PSGPUWrapper::EndPass() { std::max(keysize_max, current_task_->device_dim_keys_[i][j].size()); } } - - auto dump_pool_to_cpu_func = [this](int i, int j) { + int thread_num = 8; + auto dump_pool_to_cpu_func = [this, thread_num](int i, int j, int z) { PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(this->resource_->dev_id(i))); auto& hbm_pool = this->hbm_pools_[i * this->multi_mf_dim_ + j]; auto& device_keys = this->current_task_->device_dim_keys_[i][j]; size_t len = device_keys.size(); + // ====== multi-thread process feasign================ + int len_per_thread = len / thread_num; + int remain = len % thread_num; + int left = -1, right = -1; + int real_len = len_per_thread; + if (z < remain) real_len++; + if (z < remain) { + left = z * (len_per_thread + 1); + right = left + real_len; + } else { + left = remain * (len_per_thread + 1) + (z - remain) * len_per_thread; + right = left + real_len; + } + // ============ multi-thread process feasign============ int mf_dim = this->index_dim_vec_[j]; VLOG(0) << "dump pool to cpu table: " << i << "with mf dim: " << mf_dim; size_t feature_value_size = TYPEALIGN(8, sizeof(FeatureValue) + ((mf_dim + 1) * sizeof(float))); - - char* test_build_values = (char*)malloc(feature_value_size * len); - cudaMemcpy(test_build_values, hbm_pool->mem(), feature_value_size * len, - cudaMemcpyDeviceToHost); - + char* test_build_values = (char*)malloc(feature_value_size * real_len); + uint64_t offset = left * feature_value_size; + cudaMemcpy(test_build_values, hbm_pool->mem() + offset, + feature_value_size * real_len, cudaMemcpyDeviceToHost); CHECK(len == hbm_pool->capacity()); uint64_t unuse_key = std::numeric_limits::max(); - for (size_t i = 0; i < len; ++i) { + for (int i = left; i < right; ++i) { if (device_keys[i] == unuse_key) { continue; } - size_t offset = i * feature_value_size; - FeatureValue* gpu_val = (FeatureValue*)(test_build_values + offset); + size_t local_offset = (i - left) * feature_value_size; + FeatureValue* gpu_val = (FeatureValue*)(test_build_values + local_offset); #ifdef PADDLE_WITH_PSLIB auto* downpour_value = (paddle::ps::DownpourFixedFeatureValue*)(gpu_val->cpu_ptr); @@ -912,10 +925,13 @@ void PSGPUWrapper::EndPass() { if (multi_mf_dim_) { VLOG(0) << "psgpu wrapper dump pool: multi_mf_dim_: " << multi_mf_dim_; size_t device_num = heter_devices_.size(); - std::vector threads(device_num * multi_mf_dim_); + std::vector threads(device_num * multi_mf_dim_ * thread_num); for (size_t i = 0; i < device_num; i++) { for (int j = 0; j < multi_mf_dim_; j++) { - threads[i + j * device_num] = std::thread(dump_pool_to_cpu_func, i, j); + for (int k = 0; k < thread_num; k++) { + threads[(i + j * device_num) * thread_num + k] = + std::thread(dump_pool_to_cpu_func, i, j, k); + } } } for (std::thread& t : threads) { From 5a6c974ee4886eaa982d2db7d6db2edb7ed6b832 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 8 Jun 2022 10:03:27 +0800 Subject: [PATCH 34/53] fix unit test temp file, test=develop (#43155) --- .../fluid/tests/unittests/ipu/test_weight_decay_ipu.py | 6 +++++- python/paddle/fluid/tests/unittests/test_load_op_xpu.py | 8 ++++++-- 2 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py index 630a00f5a7d56..627a8fedff6aa 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_weight_decay_ipu.py @@ -14,7 +14,9 @@ import unittest +import os import numpy as np +import tempfile import paddle import paddle.static from paddle.fluid.tests.unittests.ipu.op_test_ipu import IPUOpTest @@ -30,6 +32,8 @@ def setUp(self): self.set_data_feed() self.set_feed_attr() self.set_attrs() + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, "weight_decay") def set_atol(self): self.atol = 1e-6 @@ -88,7 +92,7 @@ def exclude_fn(param): place = paddle.CPUPlace() exe = paddle.static.Executor(place) exe.run(startup_prog) - paddle.static.save(main_prog, "weight_decay") + paddle.static.save(main_prog, self.model_path) if run_ipu: feed_list = [image.name] diff --git a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py index f3c8024a21ad3..f11c39b7b65df 100644 --- a/python/paddle/fluid/tests/unittests/test_load_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/test_load_op_xpu.py @@ -16,6 +16,8 @@ import unittest import numpy as np +import os +import tempfile from op_test import OpTest, randomize_probability import paddle.fluid as fluid import paddle.fluid.layers as layers @@ -29,6 +31,8 @@ class TestLoadOpXpu(unittest.TestCase): """ def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + self.model_path = os.path.join(self.temp_dir.name, "model") self.ones = np.ones((4, 4)).astype('float32') main_prog = fluid.Program() start_prog = fluid.Program() @@ -44,7 +48,7 @@ def setUp(self): exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) fluid.io.save_persistables(exe, - dirname="./model", + dirname=self.model_path, main_program=main_prog) def test_load_xpu(self): @@ -52,7 +56,7 @@ def test_load_xpu(self): start_prog = fluid.Program() with fluid.program_guard(main_prog, start_prog): var = layers.create_tensor(dtype='float32') - layers.load(var, file_path='./model/w') + layers.load(var, file_path=self.model_path + '/w') exe = fluid.Executor(fluid.XPUPlace(0)) exe.run(start_prog) From 10f8637c97d4c99fbcfff700858f98e5a878fa16 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Wed, 8 Jun 2022 10:13:46 +0800 Subject: [PATCH 35/53] Fix wrong reduce_dims in fused_gate_attention and optimize the memory usage. (#43216) * Polish codes and memory usage for fused_gate_attention. * Fix wrong reduce_dims in fused_gate_attention when computing gradient of nonbatched_bias. --- .../operators/fused/fused_gate_attention.h | 178 +++++++++------- .../fused/fused_gate_attention_op.cc | 9 +- .../fused/fused_gate_attention_op.cu | 194 +++++++++--------- .../unittests/test_fused_gate_attention_op.py | 57 +++-- 4 files changed, 247 insertions(+), 191 deletions(-) diff --git a/paddle/fluid/operators/fused/fused_gate_attention.h b/paddle/fluid/operators/fused/fused_gate_attention.h index cda33987d68ac..d7ed144f02de7 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention.h +++ b/paddle/fluid/operators/fused/fused_gate_attention.h @@ -14,11 +14,11 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" #include "paddle/phi/kernels/funcs/broadcast_function.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { @@ -27,19 +27,29 @@ namespace operators { using Tensor = framework::Tensor; inline std::string MemoryDebugString(const Tensor& t) { + int device_id = platform::GetCurrentDeviceId(); + int64_t allocated = + memory::DeviceMemoryStatCurrentValue("Allocated", device_id); + int64_t reserved = + memory::DeviceMemoryStatCurrentValue("Reserved", device_id); + std::stringstream ss; ss << "shape=[" << t.dims() << "], size=" << static_cast(t.memory_size()) / (1 << 20) - << " MB, ptr=" << t.data(); - - size_t total = 0; - size_t available = 0; - platform::GpuMemoryUsage(&available, &total); - ss << "; memory allocated=" - << static_cast(total - available) / (1 << 20) << " MB"; + << " MB, ptr=" << t.data() + << "; [MEMORY] allocated=" << static_cast(allocated) / (1 << 20) + << " MB" + << ", reserved=" << static_cast(reserved) / (1 << 20) << " MB"; return ss.str(); } +template +void AllocWithDebugInfo(const platform::CUDADeviceContext& dev_ctx, + const std::string& info, Tensor* t) { + t->mutable_data(dev_ctx.GetPlace()); + VLOG(4) << info << ": " << MemoryDebugString(*t); +} + template struct TernaryAddFunctor { inline HOSTDEVICE T operator()(T a, T b, T c) const { return a + b + c; } @@ -48,6 +58,11 @@ struct TernaryAddFunctor { template struct GateAttentionConfig { public: + const platform::CUDADeviceContext& dev_ctx; + + bool merge_qkv; + bool has_gating; + int64_t batch_size; int64_t seq_len_m; int64_t seq_len_r; @@ -70,9 +85,11 @@ struct GateAttentionConfig { phi::DDim qktv_out_dims; phi::DDim gate_out_dims; - GateAttentionConfig(const Tensor* query, const Tensor* key, + GateAttentionConfig(const platform::CUDADeviceContext& dev_ctx, + const Tensor* query, const Tensor* key, const Tensor* query_weight, const Tensor* qkv_weight, - bool merge_qkv) { + bool merge_qkv, bool has_gating) + : dev_ctx(dev_ctx), merge_qkv(merge_qkv), has_gating(has_gating) { // query: shape=[batch_size, seq_len_m, seq_len_r, q_dim] batch_size = query->dims()[0]; seq_len_m = query->dims()[1]; @@ -131,59 +148,68 @@ struct GateAttentionConfig { return batch_size * seq_len_m * seq_len_r * num_heads * key_dim; } - Tensor* GetQKVOut(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetQKVOut() { if (!qkv_out.IsInitialized()) { qkv_out.Resize(qkv_out_dims); - qkv_out.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "qkv_out: " << MemoryDebugString(qkv_out); + AllocWithDebugInfo(dev_ctx, "qkv_out", &qkv_out); } return &qkv_out; } - Tensor* GetQueryOut(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetQueryOut() { if (!query_out.IsInitialized()) { query_out.Resize(q_out_dims); - query_out.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "query_out: " << MemoryDebugString(query_out); + AllocWithDebugInfo(dev_ctx, "query_out", &query_out); } return &query_out; } - Tensor* GetKeyOut(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetKeyOut() { if (!key_out.IsInitialized()) { key_out.Resize(kv_out_dims); - key_out.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "key_out: " << MemoryDebugString(key_out); + AllocWithDebugInfo(dev_ctx, "key_out", &key_out); } return &key_out; } - 
Tensor* GetValueOut(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetValueOut() { if (!value_out.IsInitialized()) { value_out.Resize(kv_out_dims); - value_out.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "value_out: " << MemoryDebugString(value_out); + AllocWithDebugInfo(dev_ctx, "value_out", &value_out); } return &value_out; } - Tensor* GetQKOut(const platform::CUDADeviceContext& dev_ctx, - Tensor* softmax_out) { + Tensor* GetQKOut(Tensor* softmax_out) { // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] int softmax_dim = m_size; if (!softmax_out || phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { // Not sure whether cudnn softmax can execute inplace. if (!qkv_out.IsInitialized()) { qk_out.Resize(qk_out_dims); - qk_out.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "qk_out: " << MemoryDebugString(qk_out); + AllocWithDebugInfo(dev_ctx, "qk_out", &qk_out); } return &qk_out; } else { + // Enable inplace softmax. return softmax_out; } } + Tensor* GetQKTVOut(Tensor* gate_out) { + if (has_gating && gate_out) { + // Reuse gate_out. + gate_out->Resize(qktv_out_dims); + return gate_out; + } else { + if (!qktv_out.IsInitialized()) { + qktv_out.Resize(qktv_out_dims); + AllocWithDebugInfo(dev_ctx, "qktv_out", &qktv_out); + } + return &qktv_out; + } + } + void ClearQKVOut() { if (qkv_out.IsInitialized()) { qkv_out.clear(); @@ -196,9 +222,14 @@ struct GateAttentionConfig { } } + void ClearQKTVOut() { + if (qktv_out.IsInitialized()) { + qktv_out.clear(); + } + } + protected: Tensor qkv_out; - // QKV is not merged Tensor query_out; Tensor key_out; Tensor value_out; @@ -207,63 +238,60 @@ struct GateAttentionConfig { // softmax_out = softmax(qk_out + nonbatched_bias + src_mask) // The shape of qk_out, softmax_out is the same, thus can be called inplace. Tensor qk_out; + // qktv_out may reuse gate_out. 
+ Tensor qktv_out; }; template struct GateAttentionGradConfig : public GateAttentionConfig { public: - GateAttentionGradConfig(const Tensor* query, const Tensor* key, + GateAttentionGradConfig(const platform::CUDADeviceContext& dev_ctx, + const Tensor* query, const Tensor* key, const Tensor* query_weight, const Tensor* qkv_weight, - bool merge_qkv) - : GateAttentionConfig(query, key, query_weight, qkv_weight, - merge_qkv) {} + bool merge_qkv, bool has_gating) + : GateAttentionConfig(dev_ctx, query, key, query_weight, qkv_weight, + merge_qkv, has_gating) {} - Tensor* GetQKVOutGrad(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetQKVOutGrad() { if (!qkv_out_grad.IsInitialized()) { qkv_out_grad.Resize(this->qkv_out_dims); - qkv_out_grad.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "qkv_out_grad: " << MemoryDebugString(qkv_out_grad); + AllocWithDebugInfo(this->dev_ctx, "qkv_out_grad", &qkv_out_grad); } return &qkv_out_grad; } - Tensor* GetQueryOutGrad(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetQueryOutGrad() { if (!query_out_grad.IsInitialized()) { query_out_grad.Resize(this->q_out_dims); - query_out_grad.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "query_out_grad: " << MemoryDebugString(query_out_grad); + AllocWithDebugInfo(this->dev_ctx, "query_out_grad", &query_out_grad); } return &query_out_grad; } - Tensor* GetKeyOutGrad(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetKeyOutGrad() { if (!key_out_grad.IsInitialized()) { key_out_grad.Resize(this->kv_out_dims); - key_out_grad.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "key_out_grad: " << MemoryDebugString(key_out_grad); + AllocWithDebugInfo(this->dev_ctx, "key_out_grad", &key_out_grad); } return &key_out_grad; } - Tensor* GetValueOutGrad(const platform::CUDADeviceContext& dev_ctx) { + Tensor* GetValueOutGrad() { if (!value_out_grad.IsInitialized()) { value_out_grad.Resize(this->kv_out_dims); - value_out_grad.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "value_out_grad: " << MemoryDebugString(value_out_grad); + AllocWithDebugInfo(this->dev_ctx, "value_out_grad", &value_out_grad); } return &value_out_grad; } - Tensor* GetQKOutGrad(const platform::CUDADeviceContext& dev_ctx, - Tensor* softmax_out_grad) { + Tensor* GetQKOutGrad(Tensor* softmax_out_grad) { // softmax_dim = qk_out_dim[-1] = qk_out_dim[rank - 1] int softmax_dim = this->m_size; if (!softmax_out_grad || - phi::UseCudnnSoftmax(dev_ctx, softmax_dim, true)) { + phi::UseCudnnSoftmax(this->dev_ctx, softmax_dim, true)) { if (!qk_out_grad.IsInitialized()) { qk_out_grad.Resize(this->qk_out_dims); - qk_out_grad.mutable_data(dev_ctx.GetPlace()); - VLOG(4) << "qk_out_grad: " << MemoryDebugString(qk_out_grad); + AllocWithDebugInfo(this->dev_ctx, "qk_out_grad", &qk_out_grad); } return &qk_out_grad; } else { @@ -288,7 +316,7 @@ class FMHAGateRef { void ComputeForward(const Tensor* nonbatched_bias, const Tensor* src_mask, Tensor* q_transpose_out, Tensor* k_transpose_out, Tensor* v_transpose_out, Tensor* qkv_transpose_out, - Tensor* softmax_out, Tensor* fmha_out, + Tensor* softmax_out, Tensor* fmha_out, Tensor* gate_out, GateAttentionConfig* config) { T* q_ptr = nullptr; T* k_ptr = nullptr; @@ -300,7 +328,7 @@ class FMHAGateRef { platform::errors::NotFound("The input qkv_transpose_out can not be " "nullptr when merge_qkv is true.")); - Tensor* qkv_out = config->GetQKVOut(dev_ctx_); + Tensor* qkv_out = config->GetQKVOut(); ComputeQKVTransposeForward(*qkv_out, qkv_transpose_out); config->ClearQKVOut(); @@ -323,9 +351,9 @@ class FMHAGateRef { 
platform::errors::NotFound("The input v_transpose_out can not be " "nullptr when merge_qkv is false.")); - Tensor* query_out = config->GetQueryOut(dev_ctx_); - Tensor* key_out = config->GetKeyOut(dev_ctx_); - Tensor* value_out = config->GetValueOut(dev_ctx_); + Tensor* query_out = config->GetQueryOut(); + Tensor* key_out = config->GetKeyOut(); + Tensor* value_out = config->GetValueOut(); ComputeQKVTransposeForward(*query_out, *key_out, *value_out, q_transpose_out, k_transpose_out, v_transpose_out); @@ -340,7 +368,7 @@ class FMHAGateRef { // [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] * // [batch_size, seq_len_m, num_heads, m_size, key_dim] // -> [batch_size, seq_len_m, num_heads, seq_len_r, m_size] - Tensor* qk_out = config->GetQKOut(dev_ctx_, softmax_out); + Tensor* qk_out = config->GetQKOut(softmax_out); T* qk_out_ptr = qk_out->data(); int64_t gemm_batch_size = @@ -362,9 +390,8 @@ class FMHAGateRef { // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] * // [batch_size, seq_len_m, num_heads, m_size, key_dim] // -> [batch_size, seq_len_m, num_heads, seq_len_r, key_dim] - Tensor qktv_out; - qktv_out.Resize(config->qktv_out_dims); - T* qktv_out_ptr = qktv_out.mutable_data(dev_ctx_.GetPlace()); + Tensor* qktv_out = config->GetQKTVOut(gate_out); + T* qktv_out_ptr = qktv_out->data(); gemm_m = config->seq_len_r; gemm_n = config->key_dim; @@ -375,7 +402,11 @@ class FMHAGateRef { gemm_m, gemm_n, gemm_k, gemm_batch_size); // fmha_out = transpose(qktv_out) - ComputeQKTVTransposeForward(qktv_out, fmha_out); + ComputeQKTVTransposeForward(*qktv_out, fmha_out); + config->ClearQKTVOut(); + if (config->has_gating) { + gate_out->Resize(config->gate_out_dims); + } } void ComputeBackward(const Tensor* q_transpose_out, @@ -409,8 +440,10 @@ class FMHAGateRef { v_ptr = k_ptr + q_size; qkv_transpose_out_grad.Resize(config->qkv_transpose_out_dims); + AllocWithDebugInfo(dev_ctx_, "qkv_transpose_out_grad", + &qkv_transpose_out_grad); - q_grad_ptr = qkv_transpose_out_grad.mutable_data(dev_ctx_.GetPlace()); + q_grad_ptr = qkv_transpose_out_grad.data(); k_grad_ptr = q_grad_ptr + q_size; v_grad_ptr = k_grad_ptr + q_size; } else { @@ -442,7 +475,7 @@ class FMHAGateRef { Tensor softmax_out_grad; softmax_out_grad.Resize(config->softmax_out_dims); - softmax_out_grad.mutable_data(dev_ctx_.GetPlace()); + AllocWithDebugInfo(dev_ctx_, "softmax_out_grad", &softmax_out_grad); int64_t gemm_batch_size = config->batch_size * config->seq_len_m * config->num_heads; @@ -450,7 +483,7 @@ class FMHAGateRef { // Forward: fmha_out = transpose(qktv_out) Tensor qktv_out_grad; qktv_out_grad.Resize(config->qktv_out_dims); - T* qktv_out_grad_ptr = qktv_out_grad.mutable_data(dev_ctx_.GetPlace()); + AllocWithDebugInfo(dev_ctx_, "qktv_out_grad", &qktv_out_grad); ComputeQKTVTransposeBackward(*fmha_out_grad, &qktv_out_grad); // Forward: qktv_out = BatchedGEMM(softmax_out, V) @@ -461,6 +494,7 @@ class FMHAGateRef { int64_t gemm_k = config->seq_len_r; const T* softmax_out_ptr = softmax_out->data(); + const T* qktv_out_grad_ptr = qktv_out_grad.data(); ComputeBatchedGEMM(softmax_out_ptr, qktv_out_grad_ptr, v_grad_ptr, true, false, gemm_m, gemm_n, gemm_k, gemm_batch_size); @@ -474,7 +508,7 @@ class FMHAGateRef { true, gemm_m, gemm_n, gemm_k, gemm_batch_size); } - Tensor* qk_out_grad = config->GetQKOutGrad(dev_ctx_, &softmax_out_grad); + Tensor* qk_out_grad = config->GetQKOutGrad(&softmax_out_grad); ComputeBiasMaskSoftmaxBackward(&softmax_out_grad, softmax_out, src_mask_grad, qk_out_grad, nonbatched_bias_grad); @@ -498,12 +532,12 @@ class 
FMHAGateRef { gemm_n, gemm_k, gemm_batch_size, alpha); if (merge_qkv_) { - Tensor* qkv_out_grad = config->GetQKVOutGrad(dev_ctx_); + Tensor* qkv_out_grad = config->GetQKVOutGrad(); ComputeQKVTransposeBackward(qkv_transpose_out_grad, qkv_out_grad); } else { - Tensor* q_out_grad = config->GetQueryOutGrad(dev_ctx_); - Tensor* k_out_grad = config->GetKeyOutGrad(dev_ctx_); - Tensor* v_out_grad = config->GetValueOutGrad(dev_ctx_); + Tensor* q_out_grad = config->GetQueryOutGrad(); + Tensor* k_out_grad = config->GetKeyOutGrad(); + Tensor* v_out_grad = config->GetValueOutGrad(); ComputeQKVTransposeBackward(q_transpose_out_grad, k_transpose_out_grad, v_transpose_out_grad, q_out_grad, k_out_grad, v_out_grad); @@ -578,12 +612,12 @@ class FMHAGateRef { if (nonbatched_bias) { std::vector ins = {qk_out, nonbatched_bias, src_mask}; std::vector outs = {qk_out}; - phi::funcs::BroadcastKernel( + phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, TernaryAddFunctor()); } else { std::vector ins = {qk_out, src_mask}; std::vector outs = {qk_out}; - phi::funcs::BroadcastKernel( + phi::funcs::BroadcastKernel( dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); } phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out, -1, softmax_out); @@ -614,12 +648,12 @@ class FMHAGateRef { phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, *softmax_out, *softmax_out_grad, -1, qk_out_grad); - // [1, bs, num_head, seq_l, seq_l] -> [bs, num_head, seq_l, seq_l] if (nonbatched_bias_grad) { - gpuStream_t stream = dev_ctx_.stream(); - TensorReduceImpl>( + // [batch_size, seq_len_m, num_heads, seq_len_r, m_size] -> + // [batch_size, 1, num_heads, seq_len_r, m_size] + phi::funcs::ReduceKernel>( dev_ctx_, *qk_out_grad, nonbatched_bias_grad, - kps::IdentityFunctor(), {0, 1}, stream); + kps::IdentityFunctor(), {1}); } } diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cc b/paddle/fluid/operators/fused/fused_gate_attention_op.cc index 0bbeabd5fc9cb..506f437b1ae54 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cc @@ -214,7 +214,7 @@ class FusedGateAttentionGradOp : public framework::OperatorWithKernel { "fused_aate_attention_arad"); if (ctx->Attrs().Get("has_gating")) { - for (auto& name : {"GateWeight", "GateBias", "GateOut"}) { + for (auto& name : {"GateWeight", "GateBias"}) { ctx->SetOutputDim(framework::GradVarName(name), ctx->GetInputDim(name)); } } @@ -224,9 +224,6 @@ class FusedGateAttentionGradOp : public framework::OperatorWithKernel { ctx->GetInputDim("NonbatchedBias")); } - ctx->SetOutputDim(framework::GradVarName("FMHAOut"), - ctx->GetInputDim("FMHAOut")); - ctx->SetOutputDim(framework::GradVarName("OutLinearWeight"), ctx->GetInputDim("OutLinearWeight")); ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), @@ -270,8 +267,6 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { } op->SetInput("FMHAOut", this->Output("FMHAOut")); - op->SetOutput(framework::GradVarName("FMHAOut"), - this->OutputGrad("FMHAOut")); if (this->HasInput("NonbatchedBias")) { op->SetInput("NonbatchedBias", this->Input("NonbatchedBias")); @@ -292,8 +287,6 @@ class FusedGateAttentionGradOpMaker : public framework::SingleGradOpMaker { this->InputGrad("GateBias")); op->SetInput("GateOut", this->Output("GateOut")); - op->SetOutput(framework::GradVarName("GateOut"), - this->OutputGrad("GateOut")); } op->SetInput("OutLinearWeight", this->Input("OutLinearWeight")); diff --git a/paddle/fluid/operators/fused/fused_gate_attention_op.cu 
b/paddle/fluid/operators/fused/fused_gate_attention_op.cu index 8f375a22cc023..ebc9a4f98d0ae 100644 --- a/paddle/fluid/operators/fused/fused_gate_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_gate_attention_op.cu @@ -79,11 +79,11 @@ void ComputeMergedQKVMatmulForward(const framework::ExecutionContext &ctx, } template -Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, - const GateAttentionGradConfig &config, - const Tensor *query, - const Tensor *qkv_out_grad, - Tensor *query_grad, bool use_addto) { +void ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, + const Tensor *qkv_out_grad, + Tensor *query_grad, bool use_addto) { auto *qkv_weight = ctx.Input("QKVWeight"); auto *qkv_weight_grad = ctx.Output(framework::GradVarName("QKVWeight")); @@ -97,7 +97,6 @@ Tensor *ComputeMergedQKVMatmulBackward(const framework::ExecutionContext &ctx, AttnMatMul(ctx.cuda_device_context(), false, true, m, n, k, false); qkv_compute.ComputeBackward(query, qkv_weight, qkv_out_grad, query_grad, qkv_weight_grad, nullptr, use_addto); - return query_grad; } template @@ -137,12 +136,14 @@ void ComputeSeparatedQKVMatmulForward(const framework::ExecutionContext &ctx, } template -Tensor *ComputeSeparatedQKVMatmulBackward( - const framework::ExecutionContext &ctx, - const GateAttentionGradConfig &config, const Tensor *query, - const Tensor *key, const Tensor *query_out_grad, const Tensor *key_out_grad, - const Tensor *value_out_grad, Tensor *query_grad, Tensor *key_grad, - bool use_addto) { +void ComputeSeparatedQKVMatmulBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, const Tensor *key, + const Tensor *query_out_grad, + const Tensor *key_out_grad, + const Tensor *value_out_grad, + Tensor *query_grad, Tensor *key_grad, + bool use_addto) { // Gradient of GEMM(key, k_weight) const auto *key_weight = ctx.Input("KeyWeight"); auto *key_weight_grad = @@ -179,22 +180,16 @@ Tensor *ComputeSeparatedQKVMatmulBackward( q_n, q_k, false); q_compute.ComputeBackward(query, query_weight, query_out_grad, query_grad, query_weight_grad, nullptr, use_addto); - return query_grad; } template -Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx, - const GateAttentionConfig &config, - const Tensor *query, - const Tensor *fmha_out) { +void ComputeGatingLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *query, const Tensor *fmha_out, + Tensor *gate_out) { auto *gate_weight = ctx.Input("GateWeight"); auto *gate_bias = ctx.Input("GateBias"); - auto *gate_out = ctx.Output("GateOut"); - gate_out->mutable_data(ctx.GetPlace()); - VLOG(4) << "[ComputeGatingLinearForward] gate_out: " - << MemoryDebugString(*gate_out); - // The first gate_bias_out stores the result of the multiplication, // and the second gate_bias_out stores the result of the multiplication + // bias. 
@@ -212,16 +207,14 @@ Tensor *ComputeGatingLinearForward(const framework::ExecutionContext &ctx, std::vector outs = {gate_out}; phi::funcs::ElementwiseKernel(ctx.cuda_device_context(), ins, &outs, SigmoidMultiplyFunctor()); - return gate_out; } template -Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, - const GateAttentionGradConfig &config, - const Tensor *fmha_out, - const Tensor *gate_out_grad, - Tensor *query_grad, Tensor *fmha_out_grad) { - const auto *query = ctx.Input("Query"); +void ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *query, const Tensor *fmha_out, + const Tensor *gate_out_grad, + Tensor *query_grad, Tensor *fmha_out_grad) { const auto *gate_weight = ctx.Input("GateWeight"); const auto *gate_bias = ctx.Input("GateBias"); @@ -255,20 +248,15 @@ Tensor *ComputeGatingLinearBackward(const framework::ExecutionContext &ctx, gate_attn_compute.ComputeBackward(query, gate_weight, &gate_bias_out, query_grad, gate_weight_grad, gate_bias_grad); - return fmha_out_grad; } template -Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx, - const GateAttentionConfig &config, - const Tensor *fmha_or_gate_out) { +void ComputeOutputLinearForward(const framework::ExecutionContext &ctx, + const GateAttentionConfig &config, + const Tensor *fmha_or_gate_out, Tensor *out) { const auto *out_linear_weight = ctx.Input("OutLinearWeight"); const auto *out_linear_bias = ctx.Input("OutLinearBias"); - auto *out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - VLOG(4) << "[ComputeOutputLinearForward] out: " << MemoryDebugString(*out); - // out = GEMM(fmha_or_gate_out, out_linear_weight) + out_linear_bias int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; @@ -277,28 +265,24 @@ Tensor *ComputeOutputLinearForward(const framework::ExecutionContext &ctx, AttnMatMul(ctx.cuda_device_context(), false, false, m, n, k, true); out_linear_compute.ComputeForward(out_linear_weight, fmha_or_gate_out, out_linear_bias, out, out); - return out; } template -Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, - const GateAttentionGradConfig &config, - bool has_gating) { - std::string input_name = has_gating ? 
"GateOut" : "FMHAOut"; - +void ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, + const GateAttentionGradConfig &config, + const Tensor *input, Tensor *input_grad) { const auto *out_grad = ctx.Input(framework::GradVarName("Out")); const auto *out_linear_weight = ctx.Input("OutLinearWeight"); - const auto *input = ctx.Input(input_name); auto *out_linear_weight_grad = ctx.Output(framework::GradVarName("OutLinearWeight")); auto *out_linear_bias_grad = ctx.Output(framework::GradVarName("OutLinearBias")); - auto *input_grad = ctx.Output(framework::GradVarName(input_name)); out_linear_weight_grad->mutable_data(ctx.GetPlace()); out_linear_bias_grad->mutable_data(ctx.GetPlace()); - input_grad->mutable_data(ctx.GetPlace()); + + auto &dev_ctx = ctx.template device_context(); int m = config.batch_size * config.seq_len_m * config.seq_len_r; int n = config.q_dim; @@ -308,7 +292,6 @@ Tensor *ComputeOutputLinearBackward(const framework::ExecutionContext &ctx, out_linear_compute.ComputeBackward(input, out_linear_weight, out_grad, input_grad, out_linear_weight_grad, out_linear_bias_grad); - return input_grad; } template @@ -330,56 +313,64 @@ class FusedGateAttentionOpKernel : public framework::OpKernel { auto *softmax_out = ctx.Output("SoftmaxOut"); auto *fmha_out = ctx.Output("FMHAOut"); + auto *gate_out = ctx.Output("GateOut"); + auto *out = ctx.Output("Out"); const bool merge_qkv = ctx.Attr("merge_qkv"); const bool has_gating = ctx.Attr("has_gating"); - // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. auto &dev_ctx = ctx.template device_context(); - GateAttentionConfig config(query, key, query_weight, qkv_weight, - merge_qkv); + AllocWithDebugInfo(dev_ctx, "softmax_out", softmax_out); + AllocWithDebugInfo(dev_ctx, "fmha_out", fmha_out); + if (has_gating) { + AllocWithDebugInfo(dev_ctx, "gate_out", gate_out); + } + AllocWithDebugInfo(dev_ctx, "out", out); + + // When seq_len_r = m_size, q_dim = kv_dim, QKV matmul can be merged. + GateAttentionConfig config(dev_ctx, query, key, query_weight, qkv_weight, + merge_qkv, has_gating); if (merge_qkv) { + PADDLE_ENFORCE_EQ(!key || query == key, true, + platform::errors::InvalidArgument( + "key is expected to be nullptr or the same as " + "query, but recieved key=%p, query=%p.", + key, query)); + // 1. Merged QKV Matmul: einsum(nbhqk,nbkhc -> nbqhc) - Tensor *qkv_out = config.GetQKVOut(dev_ctx); + Tensor *qkv_out = config.GetQKVOut(); ComputeMergedQKVMatmulForward(ctx, config, query, qkv_out); - qkv_transpose_out->mutable_data(ctx.GetPlace()); - VLOG(4) << "qkv_transpose_out:" << MemoryDebugString(*qkv_transpose_out); + AllocWithDebugInfo(dev_ctx, "qkv_transpose_out", qkv_transpose_out); } else { // 1. 
Separated QKV Matmul - Tensor *query_out = config.GetQueryOut(dev_ctx); - Tensor *key_out = config.GetKeyOut(dev_ctx); - Tensor *value_out = config.GetValueOut(dev_ctx); + Tensor *query_out = config.GetQueryOut(); + Tensor *key_out = config.GetKeyOut(); + Tensor *value_out = config.GetValueOut(); ComputeSeparatedQKVMatmulForward(ctx, config, query, key, query_out, key_out, value_out); - q_transpose_out->mutable_data(ctx.GetPlace()); - k_transpose_out->mutable_data(ctx.GetPlace()); - v_transpose_out->mutable_data(ctx.GetPlace()); - VLOG(4) << "q_transpose_out: " << MemoryDebugString(*q_transpose_out); - VLOG(4) << "k_transpose_out: " << MemoryDebugString(*k_transpose_out); - VLOG(4) << "v_transpose_out: " << MemoryDebugString(*v_transpose_out); + AllocWithDebugInfo(dev_ctx, "q_transpose_out", q_transpose_out); + AllocWithDebugInfo(dev_ctx, "k_transpose_out", k_transpose_out); + AllocWithDebugInfo(dev_ctx, "v_transpose_out", v_transpose_out); } - softmax_out->mutable_data(ctx.GetPlace()); - fmha_out->mutable_data(ctx.GetPlace()); - VLOG(4) << "softmax_out: " << MemoryDebugString(*softmax_out); - VLOG(4) << "fmha_out: " << MemoryDebugString(*fmha_out); - // 2. FMHA auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); - fmha_compute.ComputeForward( - nonbatched_bias, src_mask, q_transpose_out, k_transpose_out, - v_transpose_out, qkv_transpose_out, softmax_out, fmha_out, &config); + fmha_compute.ComputeForward(nonbatched_bias, src_mask, q_transpose_out, + k_transpose_out, v_transpose_out, + qkv_transpose_out, softmax_out, fmha_out, + gate_out, &config); // 3. Gating Linear - Tensor *fmha_or_gate_out = !has_gating ? fmha_out - : ComputeGatingLinearForward( - ctx, config, query, fmha_out); + if (has_gating) { + ComputeGatingLinearForward(ctx, config, query, fmha_out, gate_out); + } // 4. Output Linear - ComputeOutputLinearForward(ctx, config, fmha_or_gate_out); + Tensor *fmha_or_gate_out = has_gating ? gate_out : fmha_out; + ComputeOutputLinearForward(ctx, config, fmha_or_gate_out, out); } }; @@ -387,9 +378,6 @@ template class FusedGateAttentionGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { - const auto has_gating = ctx.Attr("has_gating"); - const auto merge_qkv = ctx.Attr("merge_qkv"); - // forward input const auto *query = ctx.Input("Query"); const auto *key = ctx.Input("Key"); @@ -403,56 +391,68 @@ class FusedGateAttentionGradKernel : public framework::OpKernel { const auto *qkv_transpose_out = ctx.Input("QKVTransposeOut"); const auto *softmax_out = ctx.Input("SoftmaxOut"); const auto *fmha_out = ctx.Input("FMHAOut"); + const auto *gate_out = ctx.Input("GateOut"); // backward output auto *query_grad = ctx.Output(framework::GradVarName("Query")); - query_grad->mutable_data(ctx.GetPlace()); auto *nonbatched_bias_grad = ctx.Output(framework::GradVarName("NonbatchedBias")); - auto *fmha_out_grad = ctx.Output(framework::GradVarName("FMHAOut")); + + bool has_gating = ctx.Attr("has_gating"); + bool merge_qkv = ctx.Attr("merge_qkv"); auto &dev_ctx = ctx.template device_context(); - GateAttentionGradConfig config(query, key, query_weight, qkv_weight, - merge_qkv); + AllocWithDebugInfo(dev_ctx, "query_grad", query_grad); - // 1. Gradient of Output Linear - Tensor *fhma_or_gate_out_grad = - ComputeOutputLinearBackward(ctx, config, has_gating); + GateAttentionGradConfig config(dev_ctx, query, key, query_weight, + qkv_weight, merge_qkv, has_gating); - // 2. 
Gradient of Gating Linear + Tensor fmha_out_grad; + fmha_out_grad.Resize(config.gate_out_dims); + AllocWithDebugInfo(dev_ctx, "fmha_out_grad", &fmha_out_grad); if (has_gating) { - // fhma_or_gate_out_grad is actually gate_out_grad. - fmha_out_grad->mutable_data(ctx.GetPlace()); - ComputeGatingLinearBackward(ctx, config, fmha_out, - fhma_or_gate_out_grad, query_grad, - fmha_out_grad); + // 1. Gradient of Output Linear: out = Linear(gate_out) + Tensor gate_out_grad; + gate_out_grad.Resize(config.gate_out_dims); + AllocWithDebugInfo(dev_ctx, "gate_out_grad", &gate_out_grad); + ComputeOutputLinearBackward(ctx, config, gate_out, &gate_out_grad); + + // 2. Gradient of Gating Linear + // Forward: gate_out = Sigmoid(Linear(fmha_out)) * fmha_out + ComputeGatingLinearBackward(ctx, config, query, fmha_out, + &gate_out_grad, query_grad, + &fmha_out_grad); + } else { + // 1. Gradient of Output Linear: out = Linear(fmha_grad) + ComputeOutputLinearBackward(ctx, config, fmha_out, &fmha_out_grad); } // 3. Gradient of FMHA if (nonbatched_bias_grad) { - nonbatched_bias_grad->mutable_data(ctx.GetPlace()); + AllocWithDebugInfo(dev_ctx, "nonbatched_bias_grad", + nonbatched_bias_grad); } auto fmha_compute = FMHAGateRef(dev_ctx, merge_qkv); fmha_compute.ComputeBackward( q_transpose_out, k_transpose_out, v_transpose_out, qkv_transpose_out, - softmax_out, fmha_out_grad, nullptr, nonbatched_bias_grad, &config); + softmax_out, &fmha_out_grad, nullptr, nonbatched_bias_grad, &config); bool use_addto = has_gating ? true : false; if (merge_qkv) { // 4. Gradient of Merged QKV Matmul - Tensor *qkv_out_grad = config.GetQKVOutGrad(dev_ctx); + Tensor *qkv_out_grad = config.GetQKVOutGrad(); ComputeMergedQKVMatmulBackward(ctx, config, query, qkv_out_grad, query_grad, use_addto); } else { // 4. 
Gradient of Separated QKV Matmul auto *key_grad = ctx.Output(framework::GradVarName("Key")); if (key_grad) { - key_grad->mutable_data(ctx.GetPlace()); + AllocWithDebugInfo(dev_ctx, "key_grad", key_grad); } - Tensor *query_out_grad = config.GetQueryOutGrad(dev_ctx); - Tensor *key_out_grad = config.GetKeyOutGrad(dev_ctx); - Tensor *value_out_grad = config.GetValueOutGrad(dev_ctx); + Tensor *query_out_grad = config.GetQueryOutGrad(); + Tensor *key_out_grad = config.GetKeyOutGrad(); + Tensor *value_out_grad = config.GetValueOutGrad(); ComputeSeparatedQKVMatmulBackward( ctx, config, query, key, query_out_grad, key_out_grad, value_out_grad, query_grad, key_grad, use_addto); diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index edfb46f5813b6..52418bba633f1 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -18,7 +18,7 @@ import paddle.nn as nn from paddle import tensor import unittest -from op_test import OpTest, convert_float_to_uint16 +from op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float from test_sparse_attention_op import get_cuda_version from paddle import _C_ops from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph @@ -194,23 +194,36 @@ def get_fused_gate_attention_out(self): return out, query.grad, None def check_output_and_grad(self, atol, rtol): - out_ref, query_grad_ref, key_grad_ref = self.get_reference_out() - out, query_grad, key_grad = self.get_fused_gate_attention_out() - np.testing.assert_allclose(out_ref, out.numpy(), atol=atol, rtol=rtol) - np.testing.assert_allclose(query_grad_ref, - query_grad.numpy(), - atol=atol, - rtol=rtol) - if key_grad_ref is not None and key_grad is not None: - np.testing.assert_allclose(key_grad_ref, - key_grad.numpy(), - atol=atol, - rtol=rtol) + + def _convert(value): + if self.dtype == "bfloat16": + return convert_uint16_to_float(value) + return value + + output_names = ["out", "query_grad", "key_grad"] + outputs_ref = self.get_reference_out() + outputs_fused = self.get_fused_gate_attention_out() + for i in range(len(outputs_fused)): + ref_res = outputs_ref[i] + fused_res = outputs_fused[i] + if ref_res is not None and fused_res is not None: + print("Checking {}".format(output_names[i])) + np.testing.assert_allclose(_convert(ref_res), + _convert(fused_res.numpy()), + atol=atol, + rtol=rtol) def test_output_and_grad(self): self.check_output_and_grad(atol=1e-5, rtol=1e-5) +class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp): + + def config(self): + super().config() + self.batch_size = 2 + + class TestSeparatedQKVCase(TestFusedGateAttentionOp): def config(self): @@ -243,7 +256,16 @@ def config(self): self.dtype = "float16" def test_output_and_grad(self): - self.check_output_and_grad(atol=1e-1, rtol=1e-5) + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_and_grad(atol=1e-1, rtol=1e-5) + + +class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case): + + def config(self): + super().config() + self.batch_size = 2 @unittest.skipIf( @@ -260,5 +282,12 @@ def test_output_and_grad(self): self.check_output_and_grad(atol=1e-1, rtol=1e-3) +class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case): + + def config(self): + super().config() + self.batch_size = 2 + + if __name__ == "__main__": unittest.main() From e1a34bc47c046ffe05505a3bda98e466750e9fa1 
Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 8 Jun 2022 10:41:15 +0800 Subject: [PATCH 36/53] fix bugs (#43294) --- paddle/fluid/pybind/eager_utils.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9bcac35037d04..cdbdb8c6424dd 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -734,12 +734,15 @@ PyObject* ToPyObject( PADDLE_THROW( platform::errors::Fatal("Unable to append string to py_list")); } + Py_DECREF(val_string); } if (PyDict_SetItem(dict, key_string, py_list) != 0) { PADDLE_THROW( platform::errors::Fatal("Unable to set key:value for py_dict")); } + Py_DECREF(py_list); + Py_DECREF(key_string); } return dict; From 1fbd44403fe09e3d9c57d453cf5d99a94a116f51 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 8 Jun 2022 10:42:47 +0800 Subject: [PATCH 37/53] [Paddle-Inference]support matmulv2 in multihead (#43269) * support matmulv2 in multihead --- .../ir/trt_multihead_matmul_fuse_pass.cc | 64 ++++++++++++------- 1 file changed, 42 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc index 2e3e957fd15f1..8fff2f953c378 100644 --- a/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/trt_multihead_matmul_fuse_pass.cc @@ -235,16 +235,18 @@ static int BuildFusion(Graph* graph, const std::string& name_scope) { } PDNode* TrtMultiHeadMatmulPattern::operator()() { + std::unordered_set mul_ops{"mul", "matmul_v2"}; + std::unordered_set matmul_ops{"matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_op_input("mul"); + input0->assert_is_ops_input(mul_ops); // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("mul"); + auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_ops(mul_ops); auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) ->AsInput() - ->assert_is_op_input("mul", "Y"); + ->assert_is_ops_input(mul_ops, "Y"); auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_op_output("mul"); + pattern->NewNode(mul0_out_repr())->assert_is_ops_output(mul_ops); decltype(mul0) eltadd0; decltype(mul0) eltadd0_b_var; @@ -277,11 +279,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { auto* scale = pattern->NewNode(scale_repr())->assert_is_op("scale"); auto* scale_out_var = pattern->NewNode(scale_out_repr())->assert_is_op_output("scale"); - scale_out_var->AsIntermediate()->assert_is_op_input("matmul"); + scale_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); - auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk = + pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); auto* eltadd_qk = @@ -297,12 +300,12 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr())->assert_is_op_output("softmax"); - softmax_qk_out_var->AsIntermediate()->assert_is_op_input("matmul"); + softmax_qk_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops); auto* matmul_qkv = - 
pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul"); + pattern->NewNode(matmul_qkv_repr())->assert_is_ops(matmul_ops); auto* matmul_qkv_out_var = - pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(matmul_qkv_out_repr())->assert_is_ops_output(matmul_ops); matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); auto* transpose2_qkv = @@ -315,15 +318,15 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_op_input("mul"); + reshape2_qkv_out_var->assert_is_ops_input(mul_ops); // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("mul"); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(mul_ops); auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) ->AsInput() - ->assert_is_op_input("mul", "Y"); + ->assert_is_ops_input(mul_ops, "Y"); auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_op_output("mul"); + pattern->NewNode(mul1_out_repr())->assert_is_ops_output(mul_ops); decltype(mul1) eltadd1; decltype(mul1) eltadd1_b_var; @@ -350,16 +353,16 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul"); // link to matmul qk + transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qk // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("mul"); + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(mul_ops); auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) ->AsInput() - ->assert_is_op_input("mul", "Y"); + ->assert_is_ops_input(mul_ops, "Y"); auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_op_output("mul"); + pattern->NewNode(mul2_out_repr())->assert_is_ops_output(mul_ops); decltype(mul2) eltadd2; decltype(mul2) eltadd2_b_var; @@ -386,8 +389,8 @@ PDNode* TrtMultiHeadMatmulPattern::operator()() { pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_2_out_var->AsIntermediate()->assert_is_op_input( - "matmul"); // link to matmul qkv + transpose2_2_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops); // link to matmul qkv // Q path mul0->LinksFrom({input0, mul0_w_var}).LinksTo({mul0_out_var}); @@ -734,6 +737,23 @@ TrtMultiHeadMatmulV2FusePass::TrtMultiHeadMatmulV2FusePass() { .IsType() .End(); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() @@ -866,7 +886,7 @@ int TrtMultiHeadMatmulV2FusePass::BuildFusionV2(Graph* graph, auto* mul0_op_desc = mul0->Op(); // all mul op has same input. 
- if (multihead_op_desc.HasAttr("Input_scale")) { + if (mul0_op_desc->HasAttr("Input_scale")) { multihead_op_desc.SetAttr("Input_scale", mul0_op_desc->GetAttr("Input_scale")); } From b056c9cb87151920e7e15f9e45048154fa6a024e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 8 Jun 2022 11:09:29 +0800 Subject: [PATCH 38/53] [Dy2stat]Add RollBack into original dygraph function for @to_static (#43284) * [Dy2stat]Add RollBack into original dygraph function for @to_static * fix unittest --- .../dygraph_to_static/convert_call_func.py | 1 + .../dygraph_to_static/program_translator.py | 56 ++++++++ python/paddle/fluid/dygraph/layers.py | 2 + .../test_convert_operators.py | 2 +- .../dygraph_to_static/test_rollback.py | 126 ++++++++++++++++++ 5 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index cf3383f5d0638..e660a64ab363c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -243,6 +243,7 @@ def dyfunc(x): if hasattr(func, 'forward') and isinstance(func, Layer): try: _, forward_func = unwrap_decorators(func.forward) + func._original_funcs['forward'] = forward_func.__func__ forward_func = convert_to_static(forward_func) # Bound mothod will be convert into plain function after `convert_to_static`. # So descriptor mechanism is used to bound `self` instance on function to diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 54c2b2216cd1c..c5a3915802401 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -255,6 +255,8 @@ def __init__(self, function, input_spec=None, **kwargs): if inspect.ismethod(function): self._dygraph_function = getattr(function, '__func__') self._class_instance = getattr(function, '__self__') + self._class_instance._original_funcs[ + function.__name__] = self._dygraph_function else: self._dygraph_function = function self._class_instance = None @@ -564,6 +566,60 @@ def concrete_program_specify_input_spec(self, partial_layer) = self._program_cache.last() return concrete_program + def rollback(self): + """ + Rollback into original dygraph functions for current class instance. + + Returns: + Function or Method + + Example:: + .. 
code-block:: python + + import paddle + + class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + + def forward(self, x, flag=True): + if flag: + out = x + 1 + else: + out = x - 1 + return out + + x = paddle.randn([10, 1], 'float32') + net = paddle.jit.to_static(Net()) # convert into static mode + out = net(x) + + net.forward.rollback() # rollback into dygraph mode + out = net(x) + """ + + def rollback_impl(class_instance): + for name, func in class_instance._original_funcs.items(): + setattr(class_instance, name, func.__get__(class_instance)) + + for sublayer in class_instance.sublayers(include_self=False): + rollback_impl(sublayer) + + if self._class_instance is None: + return self._dygraph_function + + # only rollback sub-functions on path of top _dygraph_function + func_name = self._dygraph_function.__name__ + assert func_name in self._class_instance._original_funcs, "Not Found function '{}' in class '{}'.".format( + func_name, self._class_instance.__name__) + func = self._class_instance._original_funcs[func_name] + setattr(self._class_instance, func_name, + func.__get__(self._class_instance)) + + for sublayer in self._class_instance.sublayers(include_self=False): + rollback_impl(sublayer) + + return getattr(self._class_instance, func_name) + @property def inputs(self): """ diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index b67f7d0a91fee..4a4bdf6e18e36 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -127,6 +127,8 @@ def __init__(self, name_scope=None, dtype="float32"): self._casted_by_pure_fp16 = False self._state_dict_hooks = collections.OrderedDict() + # Records orignal functions after @to_static to support to rollback + self._original_funcs = collections.OrderedDict() def train(self): """ diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py index 6188d6a786b2e..375873aa14fdd 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_operators.py @@ -51,7 +51,7 @@ def call_not_exist(): def forward_not_exist(): return net() - with self.assertRaises(TypeError): + with self.assertRaises(AttributeError): forward_not_exist() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py new file mode 100644 index 0000000000000..5277a50c299ea --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_rollback.py @@ -0,0 +1,126 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import numpy as np +from paddle.fluid.dygraph.dygraph_to_static.utils import func_to_source_code +from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction + + +class Net(paddle.nn.Layer): + + def __init__(self): + super(Net, self).__init__() + self.sub = SubNet() + + def forward(self, x): + x = self.sub(x) + x = foo(x) + out = self.sub.bar(x) + return out + + def infer(self, x): + x = self.sub.bar(x) + out = foo(x) + return out + + +class SubNet(paddle.nn.Layer): + + def __init__(self): + super(SubNet, self).__init__() + + def forward(self, x, flag=True): + if flag: + out = x + 1 + else: + out = x - 1 + return out + + def bar(self, x, flag=True): + if flag: + out = x + 2 + else: + out = x - 2 + return out + + +def foo(x, flag=False): + if flag: + out = x * 2. + else: + out = x / 2. + + return out + + +class TestRollBackPlainFunction(unittest.TestCase): + + def setUp(self): + paddle.set_device("cpu") + + def test_plain_func(self): + st_foo = paddle.jit.to_static(foo) + x = paddle.randn([3, 4]) + st_out = st_foo(x) + + self.assertTrue(isinstance(st_foo, StaticFunction)) + + st_foo = st_foo.rollback() + dy_out = st_foo(x) + + self.assertTrue(func_to_source_code(foo) == func_to_source_code(st_foo)) + self.assertTrue(np.array_equal(st_out.numpy(), dy_out.numpy())) + + +class TestRollBackNet(unittest.TestCase): + + def setUp(self): + paddle.set_device("cpu") + + def test_net(self): + net = paddle.jit.to_static(Net()) + x = paddle.randn([3, 4]) + st_fwd_out = net(x) + + # forward function is inplacly converted. + self.assertTrue(isinstance(net.forward, StaticFunction)) + self.assertTrue("true_fn" in func_to_source_code(net.sub.forward)) + # other non-forward function is not inplacly converted. 
+ self.assertFalse("true_fn" in func_to_source_code(net.sub.bar)) + + net.infer = paddle.jit.to_static(net.infer) + st_infer_out = net.infer(x) + self.assertTrue(isinstance(net.infer, StaticFunction)) + self.assertFalse("true_fn" in func_to_source_code(net.sub.bar)) + + # rollback forward into original dygraph method + net.forward = net.forward.rollback() + self.assertFalse(isinstance(net.forward, StaticFunction)) + self.assertFalse("true_fn" in func_to_source_code(net.sub.forward)) + dy_fwd_out = net(x) + self.assertTrue(np.array_equal(st_fwd_out.numpy(), dy_fwd_out.numpy())) + + # rollback infer into original dygraph method + net.infer.rollback() + self.assertFalse(isinstance(net.infer, StaticFunction)) + self.assertFalse("true_fn" in func_to_source_code(net.sub.forward)) + dy_infer_out = net.infer(x) + self.assertTrue( + np.array_equal(st_infer_out.numpy(), dy_infer_out.numpy())) + + +if __name__ == "__main__": + unittest.main() From 8bd3514cbd8cc968e4d69aa3236fcdd687f948f9 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Wed, 8 Jun 2022 11:35:58 +0800 Subject: [PATCH 39/53] [MLU] add logical ops (#43286) --- .../operators/controlflow/logical_op_mlu.cc | 77 ++++++ paddle/fluid/operators/mlu/mlu_baseop.cc | 8 +- paddle/fluid/operators/mlu/mlu_baseop.h | 14 +- .../unittests/mlu/test_logical_op_mlu.py | 256 ++++++++++++++++++ 4 files changed, 338 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/operators/controlflow/logical_op_mlu.cc create mode 100755 python/paddle/fluid/tests/unittests/mlu/test_logical_op_mlu.py diff --git a/paddle/fluid/operators/controlflow/logical_op_mlu.cc b/paddle/fluid/operators/controlflow/logical_op_mlu.cc new file mode 100644 index 0000000000000..231865fc7902e --- /dev/null +++ b/paddle/fluid/operators/controlflow/logical_op_mlu.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class LogicalMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + if (log_method == CNNL_LOGIC_OP_NOT) { + y = x; + } + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc y_desc(*y); + MLUCnnlTensorDesc out_desc(*out); + + MLUCnnl::Logic(ctx, log_method, x_desc.get(), GetBasePtr(x), y_desc.get(), + GetBasePtr(y), out_desc.get(), GetBasePtr(out)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(logical_not, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel); + +REGISTER_OP_MLU_KERNEL(logical_and, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel); + +REGISTER_OP_MLU_KERNEL(logical_or, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel); + +REGISTER_OP_MLU_KERNEL(logical_xor, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel, + ops::LogicalMLUKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 1ff27454013e1..5b452ca3ba2ea 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -1142,7 +1142,7 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { } /* static */ void MLUCnnl::Logic( - const ExecutionContext& ctx, const MLULogicMethod log_method, + const ExecutionContext& ctx, const cnnlLogicOp_t log_method, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, const void* input2, const cnnlTensorDescriptor_t output_desc, void* output) { @@ -1157,9 +1157,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { {static_cast(workspace_size)}, dev_ctx); void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlLogicOp( - handle, cnnlLogicOp_t(log_method), input1_desc, input1, input2_desc, - input2, workspace_ptr, workspace_size, output_desc, output)); + PADDLE_ENFORCE_MLU_SUCCESS( + cnnlLogicOp(handle, log_method, input1_desc, input1, input2_desc, input2, + workspace_ptr, workspace_size, output_desc, output)); } /* static */ void MLUCnnl::Select( diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index c97ee3efd3f56..ebb8aae1eb329 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -34,17 +34,6 @@ using ExecutionContext = framework::ExecutionContext; using DeviceContextPool = platform::DeviceContextPool; using MLUDeviceContext = platform::MLUDeviceContext; -enum MLULogicMethod { - CNNL_LOGIC_OP_EQ = 0, - CNNL_LOGIC_OP_NE = 1, - CNNL_LOGIC_OP_GT = 2, - CNNL_LOGIC_OP_GE = 3, - CNNL_LOGIC_OP_LT = 4, - CNNL_LOGIC_OP_LE = 5, - CNNL_LOGIC_OP_AND = 6, - CNNL_LOGIC_OP_OR = 7, -}; - const std::map MLUReduceOpMap = { {"reduce_all", CNNL_REDUCE_AND}, {"reduce_any", CNNL_REDUCE_OR}, {"reduce_max", CNNL_REDUCE_MAX}, {"reduce_mean", CNNL_REDUCE_AVG}, @@ -645,8 +634,7 @@ class MLUCnnl { const 
cnnlTensorDescriptor_t output_desc, void* output); - static void Logic(const ExecutionContext& ctx, - const MLULogicMethod log_method, + static void Logic(const ExecutionContext& ctx, const cnnlLogicOp_t log_method, const cnnlTensorDescriptor_t input1_desc, const void* input1, const cnnlTensorDescriptor_t input2_desc, diff --git a/python/paddle/fluid/tests/unittests/mlu/test_logical_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_logical_op_mlu.py new file mode 100755 index 0000000000000..55d3f543d9298 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_logical_op_mlu.py @@ -0,0 +1,256 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys + +sys.path.append('..') +import op_test +import unittest +import numpy as np +import paddle +from paddle.static import Program, program_guard, Executor +from paddle.framework import _non_static_mode + +paddle.enable_static() + +SUPPORTED_DTYPES = [bool, np.int8, np.int16, np.int32, np.float32] + +TEST_META_OP_DATA = [{ + 'op_str': 'logical_and', + 'binary_op': True +}, { + 'op_str': 'logical_or', + 'binary_op': True +}, { + 'op_str': 'logical_xor', + 'binary_op': True +}, { + 'op_str': 'logical_not', + 'binary_op': False +}] + +TEST_META_SHAPE_DATA = { + 'XDimLargerThanYDim1': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 5] + }, + 'XDimLargerThanYDim2': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 1] + }, + 'XDimLargerThanYDim3': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [1, 4, 1] + }, + 'XDimLargerThanYDim4': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [3, 4, 1] + }, + 'XDimLargerThanYDim5': { + 'x_shape': [2, 3, 1, 5], + 'y_shape': [3, 1, 1] + }, + 'XDimLessThanYDim1': { + 'x_shape': [4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim2': { + 'x_shape': [1, 4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim3': { + 'x_shape': [3, 4, 1], + 'y_shape': [2, 3, 4, 5] + }, + 'XDimLessThanYDim4': { + 'x_shape': [3, 1, 1], + 'y_shape': [2, 3, 1, 5] + }, + 'XDimLessThanYDim5': { + 'x_shape': [4, 5], + 'y_shape': [2, 3, 4, 5] + }, + 'Axis1InLargerDim': { + 'x_shape': [1, 4, 5], + 'y_shape': [2, 3, 1, 5] + }, + 'EqualDim1': { + 'x_shape': [10, 7], + 'y_shape': [10, 7] + }, + 'EqualDim2': { + 'x_shape': [1, 1, 4, 5], + 'y_shape': [2, 3, 1, 5] + } +} + +TEST_META_WRONG_SHAPE_DATA = { + 'ErrorDim1': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [3, 4] + }, + 'ErrorDim2': { + 'x_shape': [2, 3, 4, 5], + 'y_shape': [4, 3] + } +} + + +def run_static(x_np, y_np, op_str, use_mlu=False, binary_op=True): + paddle.enable_static() + startup_program = Program() + main_program = Program() + place = paddle.CPUPlace() + if use_mlu and paddle.is_compiled_with_mlu(): + place = paddle.MLUPlace(0) + exe = Executor(place) + with program_guard(main_program, startup_program): + x = paddle.static.data(name='x', shape=x_np.shape, dtype=x_np.dtype) + op = getattr(paddle, op_str) + feed_list = {'x': x_np} + if not binary_op: + res = op(x) + else: + y 
= paddle.static.data(name='y', shape=y_np.shape, dtype=y_np.dtype) + feed_list['y'] = y_np + res = op(x, y) + exe.run(startup_program) + static_result = exe.run(main_program, feed=feed_list, fetch_list=[res]) + return static_result + + +def run_dygraph(x_np, y_np, op_str, use_mlu=False, binary_op=True): + place = paddle.CPUPlace() + if use_mlu and paddle.is_compiled_with_mlu(): + place = paddle.MLUPlace(0) + paddle.disable_static(place) + op = getattr(paddle, op_str) + x = paddle.to_tensor(x_np, dtype=x_np.dtype) + if not binary_op: + dygraph_result = op(x) + else: + y = paddle.to_tensor(y_np, dtype=y_np.dtype) + dygraph_result = op(x, y) + return dygraph_result + + +def np_data_generator(np_shape, dtype, *args, **kwargs): + if dtype == bool: + return np.random.choice(a=[True, False], size=np_shape).astype(bool) + else: + return np.random.randn(*np_shape).astype(dtype) + + +def test(unit_test, use_mlu=False, test_error=False): + for op_data in TEST_META_OP_DATA: + meta_data = dict(op_data) + meta_data['use_mlu'] = use_mlu + np_op = getattr(np, meta_data['op_str']) + META_DATA = dict(TEST_META_SHAPE_DATA) + if test_error: + META_DATA = dict(TEST_META_WRONG_SHAPE_DATA) + for shape_data in META_DATA.values(): + for data_type in SUPPORTED_DTYPES: + meta_data['x_np'] = np_data_generator(shape_data['x_shape'], + dtype=data_type) + meta_data['y_np'] = np_data_generator(shape_data['y_shape'], + dtype=data_type) + if meta_data['binary_op'] and test_error: + # catch C++ Exception + unit_test.assertRaises(BaseException, run_static, + **meta_data) + unit_test.assertRaises(BaseException, run_dygraph, + **meta_data) + continue + static_result = run_static(**meta_data) + dygraph_result = run_dygraph(**meta_data) + if meta_data['binary_op']: + np_result = np_op(meta_data['x_np'], meta_data['y_np']) + else: + np_result = np_op(meta_data['x_np']) + unit_test.assertTrue((static_result == np_result).all()) + unit_test.assertTrue( + (dygraph_result.numpy() == np_result).all()) + + +def test_type_error(unit_test, use_mlu, type_str_map): + + def check_type(op_str, x, y, binary_op): + op = getattr(paddle, op_str) + error_type = ValueError + if isinstance(x, np.ndarray): + x = paddle.to_tensor(x) + y = paddle.to_tensor(y) + error_type = BaseException + if binary_op: + if type_str_map['x'] != type_str_map['y']: + unit_test.assertRaises(error_type, op, x=x, y=y) + if not _non_static_mode(): + error_type = TypeError + unit_test.assertRaises(error_type, op, x=x, y=y, out=1) + else: + if not _non_static_mode(): + error_type = TypeError + unit_test.assertRaises(error_type, op, x=x, out=1) + + place = paddle.CPUPlace() + if use_mlu and paddle.is_compiled_with_mlu(): + place = paddle.MLUPlace(0) + for op_data in TEST_META_OP_DATA: + meta_data = dict(op_data) + binary_op = meta_data['binary_op'] + + paddle.disable_static(place) + x = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['x']) + y = np.random.choice(a=[0, 1], size=[10]).astype(type_str_map['y']) + check_type(meta_data['op_str'], x, y, binary_op) + + paddle.enable_static() + startup_program = paddle.static.Program() + main_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + x = paddle.static.data(name='x', + shape=[10], + dtype=type_str_map['x']) + y = paddle.static.data(name='y', + shape=[10], + dtype=type_str_map['y']) + check_type(meta_data['op_str'], x, y, binary_op) + + +def type_map_factory(): + return [{ + 'x': x_type, + 'y': y_type + } for x_type in SUPPORTED_DTYPES for y_type in 
SUPPORTED_DTYPES] + + +class TestMLU(unittest.TestCase): + + def test(self): + test(self, True) + + def test_error(self): + test(self, True, True) + + def test_type_error(self): + type_map_list = type_map_factory() + for type_map in type_map_list: + test_type_error(self, True, type_map) + + +if __name__ == '__main__': + unittest.main() From 99c6497b2056c09d7f0fe520f68c369043d61586 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Wed, 8 Jun 2022 11:36:29 +0800 Subject: [PATCH 40/53] [Phi]Move group op kernel into PHI and add yaml / unittest (#43104) * move_group_norm * move group norm backward * fix code format * modify code according comment --- paddle/fluid/operators/group_norm_op.cc | 106 +--- paddle/fluid/operators/group_norm_op_npu.cc | 3 +- paddle/phi/infermeta/ternary.cc | 117 +++++ paddle/phi/infermeta/ternary.h | 10 + .../phi/kernels/cpu/group_norm_grad_kernel.cc | 204 ++++++++ paddle/phi/kernels/cpu/group_norm_kernel.cc | 210 ++++++++ .../phi/kernels/gpu/group_norm_grad_kernel.cu | 452 ++++++++++++++++++ paddle/phi/kernels/gpu/group_norm_kernel.cu | 233 +++++++++ paddle/phi/kernels/gpu/group_norm_utils.h | 174 +++++++ paddle/phi/kernels/group_norm_grad_kernel.h | 39 ++ paddle/phi/kernels/group_norm_kernel.h | 35 ++ paddle/phi/ops/compat/group_norm_sig.cc | 39 ++ python/paddle/fluid/dygraph/nn.py | 12 +- .../tests/unittests/test_group_norm_op.py | 27 +- .../tests/unittests/test_group_norm_op_v2.py | 9 + python/paddle/nn/functional/norm.py | 2 +- python/paddle/utils/code_gen/api.yaml | 12 + python/paddle/utils/code_gen/backward.yaml | 13 + tools/infrt/skipped_phi_api.json | 2 +- 19 files changed, 1597 insertions(+), 102 deletions(-) create mode 100644 paddle/phi/kernels/cpu/group_norm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/group_norm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/group_norm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/group_norm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/group_norm_utils.h create mode 100644 paddle/phi/kernels/group_norm_grad_kernel.h create mode 100644 paddle/phi/kernels/group_norm_kernel.h create mode 100644 paddle/phi/ops/compat/group_norm_sig.cc diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc index 4d989ed1f2ec0..e35598f23e9d8 100644 --- a/paddle/fluid/operators/group_norm_op.cc +++ b/paddle/fluid/operators/group_norm_op.cc @@ -12,13 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/group_norm_op.h" - #include #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/ternary.h" + namespace paddle { namespace operators { @@ -29,91 +33,6 @@ using DataLayout = framework::DataLayout; class GroupNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "GroupNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "GroupNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Mean"), "Output", "Mean", "GroupNorm"); - OP_INOUT_CHECK(ctx->HasOutput("Variance"), "Output", "Variance", - "GroupNorm"); - - auto x_dim = ctx->GetInputDim("X"); - PADDLE_ENFORCE_GE( - x_dim.size(), 2, - platform::errors::InvalidArgument( - "The Input(X)'s dimension of Op(group_norm) must be " - "greater than 1. But received: %u-D Tensor, which shape is [%s].", - x_dim.size(), x_dim)); - - const std::string data_layout_str = - ctx->Attrs().Get("data_layout"); - const framework::DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const int64_t channel_num = - (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); - auto batch_size = x_dim[0]; - auto groups = ctx->Attrs().Get("groups"); - PADDLE_ENFORCE_LE( - groups, channel_num, - platform::errors::InvalidArgument( - "The Attr(groups) of Op(group_norm) must be less than or " - "equal to the number of channels. But received: groups " - "is [%s], channels is [%s], the Attr(data_layout) " - "is [%s]. The error may come from wrong data_layout setting.", - groups, channel_num, data_layout_str)); - PADDLE_ENFORCE_GE( - groups, 1, - platform::errors::InvalidArgument( - "The Attr(groups) of Op(group_norm) must be " - "greater than or equal to 1. But received: groups is [%s].", - groups)); - PADDLE_ENFORCE_EQ( - channel_num % groups, 0, - platform::errors::InvalidArgument( - "Expected number of channels in input to be divisible by " - "num_groups, but got input channel is %d and num_groups is %d", - channel_num, groups)); - - if (ctx->HasInput("Scale")) { - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Scale").size(), 1UL, - platform::errors::InvalidArgument( - "The Input(Scale) of Op(group_norm) should be 1-D Tensor. " - "But received: %u-D Tensor, the shape of Input(Scale) is [%s].", - ctx->GetInputDim("Scale").size(), ctx->GetInputDim("Scale"))); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Scale")[0], channel_num, - platform::errors::InvalidArgument( - "The Input(Scale)'s first dimension size of Op(group_norm) must " - "be equal to the number of channels. But received: the " - "Input(Scale)'s first dimension size is [%s], the channels is " - "[%s], the Attr(data_layout) is [%s]. The error may come " - "from wrong data_layout setting.", - ctx->GetInputDim("Scale")[0], channel_num, data_layout_str)); - } - if (ctx->HasInput("Bias")) { - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Bias").size(), 1UL, - platform::errors::InvalidArgument( - "The Input(Bias) of Op(group_norm) should be 1-D Tensor. 
" - "But received: %u-D Tensor, the shape of Input(Bias) is [%s].", - ctx->GetInputDim("Bias").size(), ctx->GetInputDim("Bias"))); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Bias")[0], channel_num, - platform::errors::InvalidArgument( - "The Input(Bias)'s first dimension size of " - "Op(group_norm) must be equal to the number of channels. " - "But received: the Input(Bias)'s first dimension size is [%s], " - "the channels is [%s], the Attr(data_layout) is [%s]. The " - "error may come from wrong data_layout setting.", - ctx->GetInputDim("Bias")[0], channel_num, data_layout_str)); - } - - ctx->SetOutputDim("Y", ctx->GetInputDim("X")); - ctx->SetOutputDim("Mean", {batch_size, groups}); - ctx->SetOutputDim("Variance", {batch_size, groups}); - ctx->ShareLoD("X", "Y"); - } }; class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker { @@ -252,17 +171,14 @@ class GroupNormOpInferVarType } // namespace operators } // namespace paddle +DECLARE_INFER_SHAPE_FUNCTOR(group_norm, GroupNormInferShapeFunctor, + PD_INFER_META(phi::GroupNormInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker, ops::GroupNormOpInferVarType, ops::GroupNormGradMaker, - ops::GroupNormGradMaker); + ops::GroupNormGradMaker, + GroupNormInferShapeFunctor); REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp, ops::GroupNormGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - group_norm, ops::GroupNormKernel, - ops::GroupNormKernel); -REGISTER_OP_CPU_KERNEL( - group_norm_grad, - ops::GroupNormGradKernel, - ops::GroupNormGradKernel); diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc index dfc509941bc2d..8217815f9d74c 100644 --- a/paddle/fluid/operators/group_norm_op_npu.cc +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/group_norm_op.h" +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc index d84cc9e6d75af..a22f720b97e76 100644 --- a/paddle/phi/infermeta/ternary.cc +++ b/paddle/phi/infermeta/ternary.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/common/layout.h" #include "paddle/phi/core/ddim.h" #include "paddle/phi/kernels/funcs/common_shape.h" @@ -363,6 +364,122 @@ void GraphSendRecvInferMeta(const MetaTensor& x, } } +void GroupNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout_str, + MetaTensor* y, + MetaTensor* mean, + MetaTensor* variance) { + PADDLE_ENFORCE_NE(y, + nullptr, + phi::errors::InvalidArgument( + "The y in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE(mean, + nullptr, + phi::errors::InvalidArgument( + "The mean in GroupNormInferMeta can't be nullptr.")); + PADDLE_ENFORCE_NE( + variance, + nullptr, + phi::errors::InvalidArgument( + "The variance in GroupNormInferMeta can't be nullptr.")); + + auto x_dim = x.dims(); + PADDLE_ENFORCE_GE( + x_dim.size(), + 2, + phi::errors::InvalidArgument( + "The Input(X)'s dimension of Op(group_norm) must be " + "greater than 1. 
But received: %u-D Tensor, which shape is [%s].", + x_dim.size(), + x_dim)); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + const int64_t channel_num = + (data_layout == DataLayout::kNCHW ? x_dim[1] : x_dim[x_dim.size() - 1]); + auto batch_size = x_dim[0]; + PADDLE_ENFORCE_LE( + groups, + channel_num, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be less than or " + "equal to the number of channels. But received: groups " + "is [%s], channels is [%s], the Attr(data_layout) " + "is [%s]. The error may come from wrong data_layout setting.", + groups, + channel_num, + data_layout_str)); + PADDLE_ENFORCE_GE( + groups, + 1, + phi::errors::InvalidArgument( + "The Attr(groups) of Op(group_norm) must be " + "greater than or equal to 1. But received: groups is [%s].", + groups)); + PADDLE_ENFORCE_EQ( + channel_num % groups, + 0, + phi::errors::InvalidArgument( + "Expected number of channels in input to be divisible by " + "num_groups, but got input channel is %d and num_groups is %d", + channel_num, + groups)); + + if (scale) { + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Scale) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Scale) is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Scale)'s first dimension size of Op(group_norm) must " + "be equal to the number of channels. But received: the " + "Input(Scale)'s first dimension size is [%s], the channels is " + "[%s], the Attr(data_layout) is [%s]. The error may come " + "from wrong data_layout setting.", + scale.dims()[0], + channel_num, + data_layout_str)); + } + if (bias) { + PADDLE_ENFORCE_EQ( + bias.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The Input(Bias) of Op(group_norm) should be 1-D Tensor. " + "But received: %u-D Tensor, the shape of Input(Bias) is [%s].", + bias.dims().size(), + bias.dims())); + PADDLE_ENFORCE_EQ( + bias.dims()[0], + channel_num, + phi::errors::InvalidArgument( + "The Input(Bias)'s first dimension size of " + "Op(group_norm) must be equal to the number of channels. " + "But received: the Input(Bias)'s first dimension size is [%s], " + "the channels is [%s], the Attr(data_layout) is [%s]. 
The " + "error may come from wrong data_layout setting.", + bias.dims()[0], + channel_num, + data_layout_str)); + } + y->set_dims(x_dim); + y->set_dtype(x.dtype()); + y->share_lod(x); + mean->set_dims({batch_size, groups}); + variance->set_dims({batch_size, groups}); +} + void LayerNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h index 760011ad829fc..40461d299fb01 100644 --- a/paddle/phi/infermeta/ternary.h +++ b/paddle/phi/infermeta/ternary.h @@ -69,6 +69,16 @@ void GraphSendRecvInferMeta(const MetaTensor& x, MetaTensor* out, MetaTensor* dst_count); +void GroupNormInferMeta(const MetaTensor& x, + const MetaTensor& scale, + const MetaTensor& bias, + float epsilon, + int groups, + const std::string& data_layout, + MetaTensor* y, + MetaTensor* mean, + MetaTensor* variance); + void LayerNormInferMeta(const MetaTensor& x, const MetaTensor& scale, const MetaTensor& bias, diff --git a/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc new file mode 100644 index 0000000000000..949f9148761f5 --- /dev/null +++ b/paddle/phi/kernels/cpu/group_norm_grad_kernel.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/group_norm_grad_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void GroupNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const DenseTensor& y, + const DenseTensor& mean, + const DenseTensor& var, + const DenseTensor& d_y, + float epsilon, + int groups, + const std::string& data_layout_str, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + const auto& x_dims = y.dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = C / groups; + + dev_ctx.template Alloc(d_x); + phi::funcs::SetConstant set_zero; + + auto* x_data = y.data(); + auto* d_x_data = d_x->data(); + auto* y_data = d_y.data(); + auto* var_data = var.data(); + T* d_scale_data = nullptr; + if (d_scale) { + dev_ctx.template Alloc(d_scale); + set_zero(dev_ctx, d_scale, static_cast(0)); + d_scale_data = d_scale->data(); + } + T* d_bias_data = nullptr; + if (d_bias) { + dev_ctx.template Alloc(d_bias); + set_zero(dev_ctx, d_bias, static_cast(0)); + d_bias_data = d_bias->data(); + } + + const T* scale_data = nullptr; + if (scale_ptr) scale_data = scale_ptr->data(); + const T* bias_data = nullptr; + if (bias_ptr) bias_data = bias_ptr->data(); + + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } + auto* iter_x_data = x_data; + auto* iter_d_x_data = d_x_data; + auto* iter_y_data = y_data; + for (int bid = 0; bid < x_dims[0]; bid++) { + for (int gid = 0; gid < groups; gid++) { + T x_var = var_data[bid * groups + gid]; + T var_inv = 1.0 / sqrt(x_var + epsilon); + int number = std::min(group_size, static_cast(C - gid * group_size)); + T number_inv = 1.0 / (number * imsize); + auto* tmp_x = iter_x_data; + auto* tmp_y = iter_y_data; + auto* tmp_d_x = iter_d_x_data; + auto* x_src_data = iter_x_data; + auto* y_src_data = iter_y_data; + auto* iter_x_data_backup = iter_x_data; + auto* iter_y_data_backup = iter_y_data; + auto* iter_d_x_data_backup = iter_d_x_data; + T dp_scale = 0, dp_bias = 0; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_x_data++, iter_y_data++) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; + T dval = iter_y_data[0]; + dp_scale += val * dval; + if (scale_data) + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; + if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; + if (d_scale_data) + d_scale_data[gid * group_size + cid] += val * dval; + } + } + + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data++, tmp_x++, tmp_y++) { + T v_y = tmp_x[0]; + T dly = tmp_y[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = 1., v_bias = 0.; + if (scale_data) v_scale = scale_data[gid * group_size + cid]; + if (bias_data) v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; + iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; + } + } + } else { + for (int cid = 0; cid < number; cid++) { + iter_x_data = x_src_data + cid; + iter_y_data = y_src_data + cid; + for (int imid = 0; imid < imsize; + imid++, iter_x_data += C, iter_y_data += C) { + T val = iter_x_data[0]; + if (bias_data) val -= bias_data[gid * group_size + cid]; + T dval = iter_y_data[0]; + dp_scale += val * dval; + if (scale_data) + dp_bias += dval * scale_data[gid * group_size + cid]; + + if (scale_data && scale_data[gid * group_size + cid] != 0) + val /= scale_data[gid * group_size + cid]; + if (d_bias_data) d_bias_data[gid * group_size + cid] += dval; + if (d_scale_data) + d_scale_data[gid * group_size + cid] += val * dval; + } + } + + for (int cid = 0; cid < number; cid++) { + 
tmp_x = x_src_data + cid; + tmp_y = y_src_data + cid; + iter_d_x_data = tmp_d_x + cid; + for (int imid = 0; imid < imsize; + imid++, iter_d_x_data += C, tmp_x += C, tmp_y += C) { + T v_y = tmp_x[0]; + T dly = tmp_y[0]; + T dss = dp_scale; + T dbs = dp_bias; + T v_scale = 1.0, v_bias = 0.; + if (scale_data) v_scale = scale_data[gid * group_size + cid]; + if (bias_data) v_bias = bias_data[gid * group_size + cid]; + v_y -= v_bias; + if (v_scale != 0) v_y /= v_scale; + iter_d_x_data[0] = + (dly * v_scale - number_inv * dss * v_y - number_inv * dbs) * + var_inv; + } + } + iter_x_data = iter_x_data_backup + group_size; + iter_y_data = iter_y_data_backup + group_size; + iter_d_x_data = iter_d_x_data_backup + group_size; + } + } + if (data_layout == DataLayout::kNHWC) { + iter_x_data = x_data + (bid + 1) * C * imsize; + iter_d_x_data = d_x_data + (bid + 1) * C * imsize; + iter_y_data = y_data + (bid + 1) * C * imsize; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + group_norm_grad, CPU, ALL_LAYOUT, phi::GroupNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/group_norm_kernel.cc b/paddle/phi/kernels/cpu/group_norm_kernel.cc new file mode 100644 index 0000000000000..12aedf4cb4475 --- /dev/null +++ b/paddle/phi/kernels/cpu/group_norm_kernel.cc @@ -0,0 +1,210 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/group_norm_kernel.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void GroupNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + float epsilon, + int groups, + const std::string& data_layout_str, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + const auto x_dims = x.dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = C / groups; + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + + auto* x_data = x.data(); + auto* y_data = y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + + const T* scale_data = nullptr; + if (scale_ptr) scale_data = scale_ptr->data(); + const T* bias_data = nullptr; + if (bias_ptr) bias_data = bias_ptr->data(); + + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } + auto* iter_x_data = x_data; + auto* iter_y_data = y_data; + for (int bid = 0; bid < x_dims[0]; bid++) { + for (int gid = 0; gid < groups; gid++) { + const int64_t M = 8; + std::array x_mean_arr; + std::array x_var_arr; + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + T x_mean = 0, x_var = 0; + int number = std::min(group_size, static_cast(C - gid * group_size)); + auto* tmp_x = iter_x_data; + auto* x_src_data = iter_x_data; + auto* tmp_y = iter_y_data; + auto* y_src_data = iter_y_data; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + int imid; + for (imid = 0; imid < imsize - (imsize % M); + imid += M, iter_x_data += M) { + // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used + // in template class/function, before we complete high + // performance cpu vector extension, temporarily unrolling + // loop to get high precision and performance + x_mean_arr[0] += iter_x_data[0]; + x_var_arr[0] += iter_x_data[0] * iter_x_data[0]; + x_mean_arr[1] += iter_x_data[1]; + x_var_arr[1] += iter_x_data[1] * iter_x_data[1]; + x_mean_arr[2] += iter_x_data[2]; + x_var_arr[2] += iter_x_data[2] * iter_x_data[2]; + x_mean_arr[3] += iter_x_data[3]; + x_var_arr[3] += iter_x_data[3] * iter_x_data[3]; + x_mean_arr[4] += iter_x_data[4]; + x_var_arr[4] += iter_x_data[4] * iter_x_data[4]; + x_mean_arr[5] += iter_x_data[5]; + x_var_arr[5] += iter_x_data[5] * iter_x_data[5]; + x_mean_arr[6] += iter_x_data[6]; + x_var_arr[6] += iter_x_data[6] * iter_x_data[6]; + x_mean_arr[7] += iter_x_data[7]; + x_var_arr[7] += iter_x_data[7] * iter_x_data[7]; + } + x_mean = + std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); + x_var = std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + for (; imid < imsize; imid++, iter_x_data++) { + x_mean += iter_x_data[0]; + x_var += iter_x_data[0] * iter_x_data[0]; + } + } + } else { + for (int cid = 0; cid < number; cid++) { + iter_x_data = tmp_x + cid; + int imid; + for (imid = 0; imid < imsize - (imsize % M); + imid += M, iter_x_data += M * C) { + // TODO(gaoxiang): Because AVX/AVX2/AVX512 can not directly used + // in template class/function, before we complete high + // performance cpu vector extension, temporarily unrolling + // loop to get high precision and performance + x_mean_arr[0] += iter_x_data[0 * C]; + x_var_arr[0] += iter_x_data[0 * C] * iter_x_data[0 * C]; + x_mean_arr[1] += iter_x_data[1 * C]; + x_var_arr[1] += iter_x_data[1 * C] * iter_x_data[1 * C]; + x_mean_arr[2] += iter_x_data[2 * C]; + x_var_arr[2] += iter_x_data[2 * C] * iter_x_data[2 * C]; + x_mean_arr[3] += iter_x_data[3 * C]; + x_var_arr[3] += iter_x_data[3 * C] * iter_x_data[3 * C]; + x_mean_arr[4] 
+= iter_x_data[4 * C]; + x_var_arr[4] += iter_x_data[4 * C] * iter_x_data[4 * C]; + x_mean_arr[5] += iter_x_data[5 * C]; + x_var_arr[5] += iter_x_data[5 * C] * iter_x_data[5 * C]; + x_mean_arr[6] += iter_x_data[6 * C]; + x_var_arr[6] += iter_x_data[6 * C] * iter_x_data[6 * C]; + x_mean_arr[7] += iter_x_data[7 * C]; + x_var_arr[7] += iter_x_data[7 * C] * iter_x_data[7 * C]; + } + x_mean = + std::accumulate(x_mean_arr.cbegin(), x_mean_arr.cend(), x_mean); + x_var = std::accumulate(x_var_arr.cbegin(), x_var_arr.cend(), x_var); + std::fill(x_mean_arr.begin(), x_mean_arr.end(), T(0)); + std::fill(x_var_arr.begin(), x_var_arr.end(), T(0)); + for (; imid < imsize; imid++, iter_x_data += C) { + x_mean += iter_x_data[0]; + x_var += iter_x_data[0] * iter_x_data[0]; + } + } + iter_x_data = tmp_x + group_size; + } + + x_mean /= number * imsize; + x_var /= number * imsize; + x_var = std::max(x_var - x_mean * x_mean, T(0)); + T var_inv = T(1) / std::sqrt(x_var + epsilon); + mean_data[bid * groups + gid] = x_mean; + var_data[bid * groups + gid] = x_var; + + if (data_layout == DataLayout::kNCHW) { + for (int cid = 0; cid < number; cid++) { + for (int imid = 0; imid < imsize; imid++, tmp_x++, iter_y_data++) { + T val = (tmp_x[0] - x_mean) * var_inv; + if (scale_data) val *= scale_data[gid * group_size + cid]; + if (bias_data) val += bias_data[gid * group_size + cid]; + iter_y_data[0] = val; + } + } + } else { + for (int cid = 0; cid < number; cid++) { + tmp_x = x_src_data + cid; + iter_y_data = y_src_data + cid; + for (int imid = 0; imid < imsize; + imid++, tmp_x += C, iter_y_data += C) { + T val = (tmp_x[0] - x_mean) * var_inv; + if (scale_data) val *= scale_data[gid * group_size + cid]; + if (bias_data) val += bias_data[gid * group_size + cid]; + iter_y_data[0] = val; + } + } + iter_y_data = tmp_y + group_size; + } + } + if (data_layout == DataLayout::kNHWC) { + iter_x_data = x_data + (bid + 1) * C * imsize; + iter_y_data = y_data + (bid + 1) * C * imsize; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + group_norm, CPU, ALL_LAYOUT, phi::GroupNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu new file mode 100644 index 0000000000000..8af66fe0f29b6 --- /dev/null +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -0,0 +1,452 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
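+//
+// GPU backward kernels for group_norm. GroupNormGradKernel (below) computes
+// d_x, d_scale and d_bias: the NCHW path first reduces ds/db per (n, c) and
+// then forms coefficients p1, p2, p3 so that d_x = p1 * d_y + p2 * x + p3,
+// while the NHWC path works from the forward output y via the flag-dispatched
+// GroupNormBackwardGetMeanAndVar / GroupNormBackward kernels.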
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/group_norm_utils.h" +#include "paddle/phi/kernels/group_norm_grad_kernel.h" + +namespace phi { + +template +__global__ void GroupNormBackwardGetMeanAndVar(const T* x, + const T* scale, + const T* bias, + const T* d_y, + int N, + int C, + int W, + int imsize, + int groups, + int group_size, + T epsilon, + T* d_mean, + T* d_var, + T* d_scale, + T* d_bias) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int H = imsize / W; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; + T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0; + + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T val, dval; + + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid] - x_bias; + dval = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + + d_var_data += val * dval; + d_mean_data += dval * x_scale; + + val = val * x_scale_inv; + d_bias_data += dval; + d_scale_data += val * dval; + } + CudaAtomicAddWithWarp(&(d_mean[bid * groups + gid]), d_mean_data); + CudaAtomicAddWithWarp(&(d_var[bid * groups + gid]), d_var_data); + if (flags & kHasScale) CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); + if (flags & kHasBias) CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); +} + +template +__global__ void GroupNormBackward(const T* x, + const T* d_y, + const T* scale, + const T* bias, + const T* var, + const T* d_mean, + const T* d_var, + int N, + int C, + int W, + int imsize, + int groups, + int group_size, + T epsilon, + T* d_x) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int H = imsize / W; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_var = var[bid * groups + gid]; + T d_x_mean = d_mean[bid * groups + gid]; + T d_x_var = d_var[bid * groups + gid]; + + T x_var_inv = 1.0 / sqrt(x_var + epsilon); + T number_inv = 1.0 / (number * imsize); + + T x_scale = (flags & kHasScale) ? scale[ccid] : 1; + T x_bias = (flags & kHasBias) ? 
bias[ccid] : 0; + T x_scale_inv = 0; + if (x_scale != 0) x_scale_inv = 1.0 / x_scale; + + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int hid = imid / W; + int wid = imid % W; + T tmp = x[(bid * H + hid) * W * C + wid * C + ccid]; + T v_y = (tmp - x_bias) * x_scale_inv; + T dly = d_y[(bid * H + hid) * W * C + wid * C + ccid]; + d_x[(bid * H + hid) * W * C + wid * C + ccid] = + x_var_inv * + (dly * x_scale - number_inv * d_x_var * v_y - number_inv * d_x_mean); + } +} + +template +__global__ void ScalarGetDsDbCUDAKernel( + int imsize, const T* x, const T* dy, T* ds, T* db) { + const int nc = blockIdx.x; + T ds_sum = 0; + T db_sum = 0; + for (int i = threadIdx.x; i < imsize; i += blockDim.x) { + const int index = nc * imsize + i; + ds_sum += dy[index] * x[index]; + db_sum += dy[index]; + } + ReduceMeanAndVar(db, ds, db_sum, ds_sum, 1); +} + +template +__global__ void GetScaleBiasGradientCUDAKernel(int N, + int C, + int group, + T epsilon, + const T* mean, + const T* var, + const T* ds, + const T* db, + T* d_scale, + T* d_bias) { + const int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c < C) { + const int G = group; + const int D = C / G; + T sum1 = 0; + T sum2 = 0; + for (int n = 0; n < N; ++n) { + const int nc = n * C + c; + const int ng = n * G + c / D; + sum1 += (d_scale == nullptr) + ? T(0) + : ((ds[nc] - db[nc] * static_cast(mean[ng])) * + static_cast(rsqrt(var[ng] + epsilon))); + sum2 += (d_bias == nullptr) ? T(0) : db[nc]; + } + if (d_scale != nullptr) { + d_scale[c] = sum1; + } + if (d_bias != nullptr) { + d_bias[c] = sum2; + } + } +} + +template +__global__ void GetBackwardParamsCUDAKernel(int imsize, + int groups, + int group_size, + T epsilon, + const T* mean, + const T* var, + const T* scale, + const T* ds, + const T* db, + T* p1, + T* p2, + T* p3) { + const int n = blockIdx.x; + const int g = blockIdx.y; + const int ng = n * groups + g; + T sum1 = 0; + T sum2 = 0; + T var_inv = rsqrt(var[ng] + epsilon); + for (int64_t i = threadIdx.x; i < group_size; i += blockDim.x) { + const int64_t index = ng * group_size + i; + const int64_t c = g * group_size + i; + const T scale_v = scale == nullptr ? T(1) : static_cast(scale[c]); + sum1 += ds[index] * scale_v; + sum2 += db[index] * scale_v; + const T scale_c = scale == nullptr ? 
T(0) : static_cast(scale[c]); + p1[index] = scale_c * var_inv; + } + + typedef cub::BlockReduce BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + sum1 = BlockReduce(ds_storage).Reduce(sum1, cub::Sum()); + sum2 = BlockReduce(db_storage).Reduce(sum2, cub::Sum()); + + if (threadIdx.x == 0) { + const T s = T(1) / static_cast(group_size * imsize); + const T x = (sum2 * static_cast(mean[ng]) - sum1) * + static_cast(var_inv) * static_cast(var_inv) * + static_cast(var_inv) * s; + p2[ng] = x; + p3[ng] = -x * static_cast(mean[ng]) - sum2 * static_cast(var_inv) * s; + } +} + +template +__global__ void GetXGradientCUDAKernel(int imsize, + int C, + int group_size, + int groups, + T* p1, + T* p2, + T* p3, + const T* x, + const T* dy, + T* dx) { + int cid = blockIdx.x; + int gid = blockIdx.y; + int bid = blockIdx.z; + int ccid = bid * C + gid * group_size + cid; + int ng = bid * groups + gid; + int nc = gid * group_size + cid; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + int index = (bid * C + nc) * imsize + imid; + dx[index] = p1[ccid] * dy[index] + p2[ng] * x[index] + p3[ng]; + } +} + +template +void GroupNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const DenseTensor& y, + const DenseTensor& mean, + const DenseTensor& var, + const DenseTensor& d_y, + float epsilon, + int groups, + const std::string& data_layout_str, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + const auto& x_dims = x.dims(); + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = C / groups; + const int W = (data_layout == DataLayout::kNCHW ? 
x_dims[x_dims.size() - 1] + : x_dims[x_dims.size() - 2]); + + dev_ctx.template Alloc(d_x); + phi::funcs::SetConstant set_zero; + + DenseTensor ds, db; + ds.Resize({x_dims[0], C}); + T* ds_data = dev_ctx.template Alloc(&ds); + db.Resize({x_dims[0], C}); + T* db_data = dev_ctx.template Alloc(&db); + + auto* y_data = y.data(); + auto* x_data = x.data(); + T* d_x_data = nullptr; + if (d_x) d_x_data = d_x->data(); + auto* dy_data = d_y.data(); + auto* var_data = var.data(); + auto* mean_data = mean.data(); + T* d_scale_data = nullptr; + if (d_scale) { + dev_ctx.template Alloc(d_scale); + d_scale_data = d_scale->data(); + } + T* d_bias_data = nullptr; + if (d_bias) { + dev_ctx.template Alloc(d_bias); + d_bias_data = d_bias->data(); + } + + const T* scale_data = nullptr; + if (scale_ptr) scale_data = scale_ptr->data(); + const T* bias_data = nullptr; + if (bias_ptr) bias_data = bias_ptr->data(); + + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } + +#ifdef __HIPCC__ + int block_size = std::max(std::min(256, imsize), 64); + const int block_dims = 256; +#else + int block_size = std::min(1024, imsize); + const int block_dims = 1024; +#endif + dim3 grid(group_size, groups, x_dims[0]); + dim3 threads(block_size, 1, 1); + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + if (data_layout == DataLayout::kNCHW) { + const int max_num_threads = 1024; + int max_block_size = std::min(imsize, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 blocks(block_size_nchw); + ScalarGetDsDbCUDAKernel<<>>( + imsize, x_data, dy_data, ds_data, db_data); + + if (d_scale || d_bias) { + const int block = 256; + GetScaleBiasGradientCUDAKernel + <<<(C + block - 1) / block, block, 0, dev_ctx.stream()>>>( + x_dims[0], + C, + groups, + epsilon, + mean_data, + var_data, + ds_data, + db_data, + d_scale_data, + d_bias_data); + } + + if (d_x_data != nullptr) { + // p1 * dy + p2 * x + p3, + // p1, p2, p3 represent the reverse calculation of temporary variables + // p1 = scale * var_inv + // p2 = (db * scale * mean - ds * scale) * pow(var_inv, 3) * (1/n) + // p3 = -p2 * mean[ng] - db * scale * var_inv * (1/n); + DenseTensor p1, p2, p3; + p1.Resize({x_dims[0] * C}); + T* p1_data = dev_ctx.template Alloc(&p1); + p2.Resize({x_dims[0], groups}); + T* p2_data = dev_ctx.template Alloc(&p2); + p3.Resize({x_dims[0], groups}); + T* p3_data = dev_ctx.template Alloc(&p3); + + GetBackwardParamsCUDAKernel + <<>>( + imsize, + groups, + group_size, + epsilon, + mean_data, + var_data, + scale_data, + ds_data, + db_data, + p1_data, + p2_data, + p3_data); + GetXGradientCUDAKernel + <<>>(imsize, + C, + group_size, + groups, + p1_data, + p2_data, + p3_data, + x_data, + dy_data, + d_x_data); + } + } else { + if (d_scale) { + set_zero(dev_ctx, d_scale, static_cast(0)); + } + if (d_bias) { + set_zero(dev_ctx, d_bias, static_cast(0)); + } + + DenseTensor temp_var; + temp_var.Resize(var.dims()); + dev_ctx.template Alloc(&temp_var); + set_zero(dev_ctx, &temp_var, static_cast(0)); + T* temp_var_data = temp_var.data(); + + DenseTensor temp_mean; + temp_mean.Resize(var.dims()); + dev_ctx.template Alloc(&temp_mean); + set_zero(dev_ctx, &temp_mean, static_cast(0)); + T* temp_mean_data = temp_mean.data(); + + int 
flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, + GroupNormBackwardGetMeanAndVar, + y_data, + scale_data, + bias_data, + dy_data, + x_dims[0], + C, + W, + imsize, + groups, + group_size, + epsilon, + temp_mean_data, + temp_var_data, + d_scale_data, + d_bias_data); + if (d_x_data != nullptr) { + UNROLL_ALL_CASES(flags, + GroupNormBackward, + y_data, + dy_data, + scale_data, + bias_data, + var_data, + temp_mean_data, + temp_var_data, + x_dims[0], + C, + W, + imsize, + groups, + group_size, + epsilon, + d_x_data); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + group_norm_grad, GPU, ALL_LAYOUT, phi::GroupNormGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/group_norm_kernel.cu b/paddle/phi/kernels/gpu/group_norm_kernel.cu new file mode 100644 index 0000000000000..127677233b80f --- /dev/null +++ b/paddle/phi/kernels/gpu/group_norm_kernel.cu @@ -0,0 +1,233 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/layout.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/group_norm_utils.h" +#include "paddle/phi/kernels/group_norm_kernel.h" + +namespace phi { + +template +__global__ void GroupNormForwardGetMeanAndVar(const T* x, + int N, + int C, + int W, + int imsize, + int groups, + int group_size, + T* mean, + T* var) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int H = imsize / W; + int number = min(group_size, static_cast(C - gid * group_size)); + int ccid = gid * group_size + cid; + if (ccid >= C) return; + T x_mean = 0, x_var = 0; + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T val; + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + + x_mean += val; + x_var += val * val; + } + x_mean /= number * imsize; + x_var /= number * imsize; + CudaAtomicAddWithWarp(&mean[bid * groups + gid], x_mean); + CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); +} + +template +__global__ void GroupNormForward(const T* x, + const T* mean, + const T* var, + const T* scale, + const T* bias, + int N, + int C, + int W, + int imsize, + int groups, + int group_size, + T epsilon, + T* y, + T* real_var, + const DataLayout data_layout) { + int gid = blockIdx.y; + int cid = blockIdx.x; + int bid = blockIdx.z; + int H = imsize / W; + int ccid = gid * group_size + cid; + if (ccid >= C) return; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; + x_var = x_var - x_mean * x_mean; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } + for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { + T val; + int hid, wid; + int index = (bid * C + ccid) * imsize + imid; + if (data_layout == DataLayout::kNCHW) { + val = 
x[index]; + } else { + hid = imid / W; + wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + } + val = (val - x_mean) * var_inv; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } + if (data_layout == DataLayout::kNCHW) { + y[index] = val; + } else { + y[(bid * H + hid) * W * C + wid * C + ccid] = val; + } + } +} + +template +void GroupNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + float epsilon, + int groups, + const std::string& data_layout_str, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* var) { + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + const auto scale_ptr = scale.get_ptr(); + const auto bias_ptr = bias.get_ptr(); + + const auto x_dims = x.dims(); + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int group_size = C / groups; + + const int W = (data_layout == DataLayout::kNCHW ? x_dims[x_dims.size() - 1] + : x_dims[x_dims.size() - 2]); + + dev_ctx.template Alloc(y); + dev_ctx.template Alloc(mean); + dev_ctx.template Alloc(var); + phi::funcs::SetConstant set_zero; + DenseTensor temp_var; + temp_var.Resize(var->dims()); + dev_ctx.template Alloc(&temp_var); + auto* x_data = x.data(); + auto* y_data = y->data(); + auto* mean_data = mean->data(); + auto* var_data = var->data(); + auto* temp_var_data = temp_var.data(); + + const T* scale_data = nullptr; + if (scale_ptr) scale_data = scale_ptr->data(); + const T* bias_data = nullptr; + if (bias_ptr) bias_data = bias_ptr->data(); + + int imsize = 1; + if (data_layout == DataLayout::kNCHW) { + for (int i = 2; i < x_dims.size(); ++i) { + imsize *= x_dims[i]; + } + } else { + for (int i = 1; i < x_dims.size() - 1; ++i) { + imsize *= x_dims[i]; + } + } + +#ifdef __HIPCC__ + int block_size = std::max(std::min(256, imsize), 64); +#else + int block_size = std::min(1024, imsize); +#endif + + dim3 grid(group_size, groups, x_dims[0]); + dim3 threads(block_size, 1, 1); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename kps::details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size * block_size_nchw) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW + <<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + set_zero(dev_ctx, mean, static_cast(0)); + set_zero(dev_ctx, &temp_var, static_cast(0)); + GroupNormForwardGetMeanAndVar + <<>>(x_data, + x_dims[0], + C, + W, + imsize, + groups, + group_size, + mean_data, + temp_var_data); + } + int flags = + (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; + UNROLL_ALL_CASES(flags, + GroupNormForward, + x_data, + mean_data, + temp_var_data, + scale_data, + bias_data, + x_dims[0], + C, + W, + imsize, + groups, + group_size, + epsilon, + y_data, + var_data, + data_layout); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + group_norm, GPU, ALL_LAYOUT, phi::GroupNormKernel, float, double) {} diff --git 
a/paddle/phi/kernels/gpu/group_norm_utils.h b/paddle/phi/kernels/gpu/group_norm_utils.h new file mode 100644 index 0000000000000..6af7b96ca2182 --- /dev/null +++ b/paddle/phi/kernels/gpu/group_norm_utils.h @@ -0,0 +1,174 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 + +#define CHECK_CASE(i, flags, kernel_name, ...) \ + if (i == flags) { \ + kernel_name<<>>(__VA_ARGS__); \ + } + +// 0 for no scale, no bias +// 1 for has scale, no bias +// 2 for no scale, has bias +// 3 for has scale, has bias +#define UNROLL_ALL_CASES(flags, kernel_name, ...) \ + CHECK_CASE(0, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(1, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(2, flags, kernel_name, __VA_ARGS__) \ + CHECK_CASE(3, flags, kernel_name, __VA_ARGS__) + +template +__device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { + typedef cub::WarpReduce WarpReduce; + typename WarpReduce::TempStorage temp_storage; + value = WarpReduce(temp_storage).Sum(value); + if (cub::LaneId() == 0) paddle::platform::CudaAtomicAdd(sum, value); +} + +template +__device__ __forceinline__ void ThreadReduce(phi::Array arrs, + int size, + const int offset, + AccT* out_mean, + AccT* out_var) { + const T* x = arrs[0]; + const T* y; + if (Num == 2) { + y = arrs[1]; + } + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + x -= offset; + if (Num == 2) { + y -= offset; + } + size += offset; + if (tid >= offset) { + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } + } + size -= blockDim.x; + x += blockDim.x; + if (Num == 2) { + y += blockDim.x; + } + } + int remain = size % (VecSize * blockDim.x); + + T ins_x[VecSize]; + T ins_y[VecSize]; + VecT* ins_vec_x = reinterpret_cast(&ins_x); + VecT* ins_vec_y = reinterpret_cast(&ins_y); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec_x = reinterpret_cast(x)[tid]; + if (Num == 2) { + *ins_vec_y = reinterpret_cast(y)[tid]; + } + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + if (Num == 1) { + *out_mean += ins_x[i]; + *out_var += ins_x[i] * ins_x[i]; + } else if (Num == 2) { + *out_mean += ins_y[i]; + *out_var += ins_y[i] * ins_x[i]; + } + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + if (Num == 1) { + *out_mean += x[tid]; + *out_var += x[tid] * x[tid]; + } else if (Num == 2) { + *out_mean += y[tid]; + *out_var += y[tid] * x[tid]; + } + } +} 
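+
+// ReduceMeanAndVar (below) completes the block-wide reduction of the
+// per-thread partial sums (e.g. those produced by ThreadReduce): thread 0
+// stores sum / size into the two outputs indexed by blockIdx.x. For the
+// mean/var kernels the second output is still E[x * x] at this point; the
+// forward kernel subtracts mean * mean afterwards.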
+ +template +__device__ __forceinline__ void ReduceMeanAndVar( + T* mean, T* var, T x_mean, T x_var, int size) { + const int nc = blockIdx.x; + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[nc] = static_cast(x_mean / size); + var[nc] = static_cast(x_var / size); + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + ReduceMeanAndVar(mean, var, x_mean, x_var, size); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, + T* mean, + T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + x += i * size; + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + phi::Array ins; + ins[0] = x; + ThreadReduce(ins, size, input_offset, &x_mean, &x_var); + ReduceMeanAndVar(mean, var, x_mean, x_var, size); +} + +} // namespace phi diff --git a/paddle/phi/kernels/group_norm_grad_kernel.h b/paddle/phi/kernels/group_norm_grad_kernel.h new file mode 100644 index 0000000000000..cc404f0213252 --- /dev/null +++ b/paddle/phi/kernels/group_norm_grad_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GroupNormGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + const DenseTensor& y, + const DenseTensor& mean, + const DenseTensor& variance, + const DenseTensor& d_y, + float epsilon, + int groups, + const std::string& data_layout, + DenseTensor* d_x, + DenseTensor* d_scale, + DenseTensor* d_bias); + +} // namespace phi diff --git a/paddle/phi/kernels/group_norm_kernel.h b/paddle/phi/kernels/group_norm_kernel.h new file mode 100644 index 0000000000000..36bf7125ec16e --- /dev/null +++ b/paddle/phi/kernels/group_norm_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
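+//
+// This header declares the PHI forward kernel for group_norm: GroupNormKernel
+// normalizes x over `groups` channel groups (with optional scale and bias)
+// and also returns the per-(batch, group) mean and variance consumed by the
+// backward kernel.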
+ +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void GroupNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& scale, + const paddle::optional& bias, + float epsilon, + int groups, + const std::string& data_layout, + DenseTensor* y, + DenseTensor* mean, + DenseTensor* variance); + +} // namespace phi diff --git a/paddle/phi/ops/compat/group_norm_sig.cc b/paddle/phi/ops/compat/group_norm_sig.cc new file mode 100644 index 0000000000000..d5a9cad97a2c8 --- /dev/null +++ b/paddle/phi/ops/compat/group_norm_sig.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GroupNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("group_norm", + {"X", "Scale", "Bias"}, + {"epsilon", "groups", "data_layout"}, + {"Y", "Mean", "Variance"}); +} + +KernelSignature GroupNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "group_norm_grad", + {"X", "Scale", "Bias", "Y", "Mean", "Variance", "Y@GRAD"}, + {"epsilon", "groups", "data_layout"}, + {"X@GRAD", "Scale@GRAD", "Bias@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(group_norm, phi::GroupNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(group_norm_grad, + phi::GroupNormGradOpArgumentMapping); diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 26bda1a34ef63..0f250fbd87091 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1144,8 +1144,8 @@ def __init__(self, def forward(self, input): if in_dygraph_mode(): - out, _, _, = _C_ops.final_state_instance_norm( - input, self.scale, self.bias, self._epsilon) + out = _C_ops.final_state_instance_norm(input, self.scale, self.bias, + self._epsilon) return out if _in_legacy_dygraph(): out, _, _ = _C_ops.instance_norm(input, self.scale, self.bias, @@ -3031,8 +3031,14 @@ def forward(self, input): dtype=self._dtype, stop_gradient=True) variance_out = self._helper.create_variable_for_type_inference( dtype=self._dtype, stop_gradient=True) + if in_dygraph_mode(): + out = _C_ops.final_state_group_norm(input, self.weight, self.bias, + self._epsilon, self._groups, + "NCHW") - if _non_static_mode(): + return dygraph_utils._append_activation_in_dygraph(out, self._act) + + elif _in_legacy_dygraph(): attrs = ('epsilon', self._epsilon, 'groups', self._groups) out, _, _ = _C_ops.group_norm(input, self.weight, self.bias, mean_out, variance_out, *attrs) diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py index 94793ad85cf30..179b197cf62c3 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py @@ -20,7 +20,7 @@ import paddle.fluid.core as core import 
paddle.fluid as fluid from op_test import OpTest, skip_check_grad_ci - +from paddle.fluid.framework import _test_eager_guard from testsuite import create_op @@ -301,5 +301,30 @@ def attr_data_format(): self.assertRaises(ValueError, attr_data_format) +class TestGroupNormEager(unittest.TestCase): + + def test_dygraph_final_state_api(self): + self.dtype = np.float64 + self.shape = (8, 32, 32) + input = np.random.random(self.shape).astype(self.dtype) + + with fluid.dygraph.guard(): + tensor_1 = fluid.dygraph.to_variable(input) + tensor_1.stop_gradient = False + groupNorm = fluid.dygraph.nn.GroupNorm(channels=32, groups=4) + ret1 = groupNorm(tensor_1) + ret1.backward() + with _test_eager_guard(): + tensor_eager_1 = fluid.dygraph.to_variable(input) + tensor_eager_1.stop_gradient = False + groupNorm_eager = fluid.dygraph.nn.GroupNorm(channels=32, + groups=4) + ret2 = groupNorm_eager(tensor_eager_1) + ret2.backward() + self.assertEqual(( + tensor_1.grad.numpy() == tensor_eager_1.grad.numpy()).all(), + True) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index c6bc44ebd2f24..42f97585172a5 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -22,6 +22,7 @@ from paddle.fluid.framework import grad_var_name import paddle.fluid as fluid from paddle.fluid import Program, program_guard +from paddle.fluid.framework import _test_eager_guard import paddle @@ -124,6 +125,10 @@ def compute_v2(x_np): y2 = compute_v2(x) self.assertTrue(np.allclose(y1, y2, atol=1e-5)) + def test_eager_api(self): + with _test_eager_guard(): + self.test_dygraph() + class TestGroupNormAPIV2_With_General_Dimensions(unittest.TestCase): @@ -154,6 +159,10 @@ def test_numerical_accuracy(self): self.assertTrue(np.allclose(result1, expect_res1, atol=1e-5)) self.assertTrue(np.allclose(result2, expect_res2, atol=1e-5)) + def test_eager_api(self): + with _test_eager_guard(): + self.test_numerical_accuracy() + class TestGroupNormDimException(unittest.TestCase): diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 7bc9f105cac1e..e40731b828d97 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -413,7 +413,7 @@ def instance_norm(x, """ if in_dygraph_mode(): - out, _, _, = _C_ops.final_state_instance_norm(x, weight, bias, eps) + out = _C_ops.final_state_instance_norm(x, weight, bias, eps) return out if _in_legacy_dygraph(): out, _, _ = _C_ops.instance_norm(x, weight, bias, "epsilon", eps, diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 8ed4832a8f751..fd000567c507b 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -933,6 +933,17 @@ kernel : func : greater_than +- api : group_norm + args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) + output : Tensor(y), Tensor(mean), Tensor(variance) + infer_meta : + func : GroupNormInferMeta + kernel : + func : group_norm + optional : scale, bias + intermediate : mean, variance + backward : group_norm_grad + - api : gumbel_softmax args : (Tensor x, float temperature, bool hard, int axis) output : Tensor @@ -1039,6 +1050,7 @@ func : instance_norm data_type : x optional : scale, bias + intermediate : saved_mean, saved_variance backward : instance_norm_grad # is_empty diff --git 
a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 6a555fd24a066..81641ac19f7b5 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -844,6 +844,19 @@ data_type : out_grad optional: out, dst_count +- backward_api : group_norm_grad + forward : group_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) -> Tensor(y), Tensor(mean), Tensor(variance) + args : (Tensor x, Tensor scale, Tensor bias, Tensor y, Tensor mean, Tensor variance, Tensor y_grad, float epsilon, int groups, str data_layout) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [y, scale, bias] + kernel : + func : group_norm_grad + data_type : y_grad + optional: scale, bias + inplace : (y_grad -> x_grad) + - backward_api : gumbel_softmax_grad forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) diff --git a/tools/infrt/skipped_phi_api.json b/tools/infrt/skipped_phi_api.json index 75533311513e5..c788128c63c7f 100644 --- a/tools/infrt/skipped_phi_api.json +++ b/tools/infrt/skipped_phi_api.json @@ -1,4 +1,4 @@ { -"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm", "instance_norm"], +"phi_apis":["conj", "deformable_conv", "dropout", "expand_as", "nll_loss", "psroi_pool", "roi_align", "roi_pool", "label_smooth", "layer_norm", "instance_norm", "group_norm"], "phi_kernels":["equal_all"] } From 88216f63734ef12049be22e23e2e3ddd435dc044 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 8 Jun 2022 13:12:28 +0800 Subject: [PATCH 41/53] fix tensor copy bug (#43299) --- paddle/phi/api/lib/tensor_copy.cc | 12 ++++++++---- paddle/phi/api/lib/tensor_copy.h | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/phi/api/lib/tensor_copy.cc b/paddle/phi/api/lib/tensor_copy.cc index 5f8c2ed71e939..fb18a3b05c77e 100644 --- a/paddle/phi/api/lib/tensor_copy.cc +++ b/paddle/phi/api/lib/tensor_copy.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/api/lib/tensor_copy.h" +#include "paddle/phi/api/include/context_pool.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -24,18 +25,21 @@ limitations under the License. */ namespace paddle { namespace experimental { -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst) { +void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst) { auto kernel_key_set = ParseKernelKeyByInputArgs(src); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(phi::TransToPhiBackend(place)); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( - "copy", kernel_key); VLOG(6) << "copy API kernel key: " << kernel_key; + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( + "copy", kernel_key); VLOG(6) << "copy API kernel: " << kernel; - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + auto target_place = phi::TransToPhiPlace(kernel_key.backend()); + auto& pool = paddle::experimental::DeviceContextPool::Instance(); + auto* dev_ctx = pool.GetMutable( + target_place.GetType() == place.GetType() ? 
place : target_place); auto dense_x = TensorToDenseTensor(src); diff --git a/paddle/phi/api/lib/tensor_copy.h b/paddle/phi/api/lib/tensor_copy.h index 3ce45853319ec..4a50b78be85d0 100644 --- a/paddle/phi/api/lib/tensor_copy.h +++ b/paddle/phi/api/lib/tensor_copy.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace experimental { -void copy(const Tensor& src, Place place, bool blocking, Tensor* dst); +void copy(const Tensor& src, const Place& place, bool blocking, Tensor* dst); } // namespace experimental } // namespace paddle From 0ffaf04964034ee6a01a5ac15ceac35746a2ee9e Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Wed, 8 Jun 2022 13:47:22 +0800 Subject: [PATCH 42/53] fix_ernie_unitest (#43283) --- paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc index 53edc554ebaf8..779b6ee8eec81 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc @@ -69,6 +69,10 @@ std::shared_ptr InitPredictor() { opt_input_shape); // erinie varlen must be used with oss config.EnableVarseqlen(); + paddle_infer::experimental::InternalUtils::SetTransformerPosid(&config, + input_name2); + paddle_infer::experimental::InternalUtils::SetTransformerMaskid(&config, + input_name3); return CreatePredictor(config); } From 5413fd79938d2e0c71e9140ac98073f88b6ef9eb Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 8 Jun 2022 14:14:15 +0800 Subject: [PATCH 43/53] [Dy2Stat]Make convert_shape return List type (#43302) --- .../fluid/dygraph/dygraph_to_static/convert_operators.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 7933ddfe590c9..78cf8f3b85c9e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -320,7 +320,7 @@ def has_negative(list_shape, idx=None): or has_negative(x.shape, idx)): return nn.shape(x) if idx is None else nn.shape(x)[idx] else: - return x.shape if idx is None else x.shape[idx] + return list(x.shape) if idx is None else x.shape[idx] def convert_var_shape_simple(x): @@ -330,7 +330,8 @@ def convert_var_shape_simple(x): if isinstance(x, Variable): return nn.shape(x) else: - return x.shape + # Use list() to make returned type consistant with dygraph + return list(x.shape) def eval_if_exist_else_none(name, global_symbol_table): From 8dab269044aa8d0ccd09f7ec1a093bcecd5495e7 Mon Sep 17 00:00:00 2001 From: Netpunk <69072522+Patrick-Star125@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:01:44 +0800 Subject: [PATCH 44/53] =?UTF-8?q?=E3=80=90Hackathon=20No.17=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20paddle.nn.CosineEmbed?= =?UTF-8?q?dingLoss=20=E5=92=8C=20paddle.nn.functional.cosine=5Fembedding?= =?UTF-8?q?=5Floss=20API=20(#41680)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add cosine embedding loss API * new version * new version * new version * set label to int32 * new version * new version-test * new version * new version * new version * new version * new version * new version * new version * new version * new version * new version * new version * new 
version * new version * new version * new version * new version * aligning to Chinese document * add name parameter * activate CI * fix format error * unit test code format * format code --- .../unittests/test_cosine_embedding_loss.py | 328 ++++++++++++++++++ python/paddle/nn/__init__.py | 2 + python/paddle/nn/functional/__init__.py | 2 + python/paddle/nn/functional/loss.py | 109 ++++++ python/paddle/nn/layer/loss.py | 91 +++++ 5 files changed, 532 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py diff --git a/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py b/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py new file mode 100644 index 0000000000000..f95089a4cdb51 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cosine_embedding_loss.py @@ -0,0 +1,328 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from __future__ import print_function + +import paddle +import paddle.static as static +import numpy as np +import unittest + + +def cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='mean'): + z = (input1 * input2).sum(axis=-1) + mag_square1 = np.square(input1).sum(axis=-1) + 10e-12 + mag_square2 = np.square(input2).sum(axis=-1) + 10e-12 + denom = np.sqrt(mag_square1 * mag_square2) + cos = z / denom + zeros = np.zeros_like(cos) + pos = 1 - cos + neg = np.clip(cos - margin, a_min=0, a_max=np.inf) + out_pos = np.where(label == 1, pos, zeros) + out_neg = np.where(label == -1, neg, zeros) + out = out_pos + out_neg + if reduction == 'none': + return out + if reduction == 'mean': + return np.mean(out) + elif reduction == 'sum': + return np.sum(out) + + +class TestFunctionCosineEmbeddingLoss(unittest.TestCase): + + def setUp(self): + self.input1_np = np.random.random(size=(5, 3)).astype(np.float64) + self.input2_np = np.random.random(size=(5, 3)).astype(np.float64) + a = np.array([-1, -1, -1]).astype(np.int32) + b = np.array([1, 1]).astype(np.int32) + self.label_np = np.concatenate((a, b), axis=0) + np.random.shuffle(self.label_np) + + def run_dynamic(self): + input1 = paddle.to_tensor(self.input1_np) + input2 = paddle.to_tensor(self.input2_np) + label = paddle.to_tensor(self.label_np) + dy_result = paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + expected1 = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='mean') + self.assertTrue(np.allclose(dy_result.numpy(), expected1)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='sum') + expected2 = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='sum') + + self.assertTrue(np.allclose(dy_result.numpy(), expected2)) + self.assertTrue(dy_result.shape, [1]) + + dy_result = paddle.nn.functional.cosine_embedding_loss(input1, + 
input2, + label, + margin=0.5, + reduction='none') + expected3 = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='none') + + self.assertTrue(np.allclose(dy_result.numpy(), expected3)) + self.assertTrue(dy_result.shape, [5]) + + def run_static(self, use_gpu=False): + input1 = static.data(name='input1', shape=[5, 3], dtype='float64') + input2 = static.data(name='input2', shape=[5, 3], dtype='float64') + label = static.data(name='label', shape=[5], dtype='int32') + result0 = paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='none') + result1 = paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='sum') + result2 = paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + place = paddle.CUDAPlace(0) if use_gpu else paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + static_result = exe.run(feed={ + "input1": self.input1_np, + "input2": self.input2_np, + "label": self.label_np + }, + fetch_list=[result0, result1, result2]) + expected = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='none') + + self.assertTrue(np.allclose(static_result[0], expected)) + expected = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='sum') + + self.assertTrue(np.allclose(static_result[1], expected)) + expected = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='mean') + + self.assertTrue(np.allclose(static_result[2], expected)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_dynamic() + paddle.enable_static() + + with static.program_guard(static.Program()): + self.run_static() + + def test_gpu(self): + if not paddle.is_compiled_with_cuda(): + return + + paddle.disable_static(place=paddle.CUDAPlace(0)) + self.run_dynamic() + paddle.enable_static() + + with static.program_guard(static.Program()): + self.run_static(use_gpu=True) + + def test_errors(self): + paddle.disable_static() + input1 = paddle.to_tensor(self.input1_np) + input2 = paddle.to_tensor(self.input2_np) + label = paddle.to_tensor(self.label_np) + + def test_label_shape_error(): + label = paddle.to_tensor( + np.random.randint(low=0, high=2, size=(2, 3))) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_label_shape_error) + + def test_input_different_shape_error(): + input1 = paddle.to_tensor(self.input1_np[0]) + label = paddle.to_tensor(np.ndarray([1])) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_input_different_shape_error) + + def test_input_shape2D_error(): + input1 = paddle.to_tensor( + np.random.random(size=(2, 3, 4)).astype(np.float64)) + input2 = paddle.to_tensor( + np.random.random(size=(2, 3, 4)).astype(np.float64)) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_input_shape2D_error) + + def test_label_value_error(): + label = paddle.to_tensor(np.ndarray([-1, -2])) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_label_value_error) + + def 
test_input_type_error(): + input1 = paddle.to_tensor(self.input1_np.astype(np.int64)) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_input_type_error) + + def test_label_type_error(): + label = paddle.to_tensor(self.label_np.astype(np.int16)) + paddle.nn.functional.cosine_embedding_loss(input1, + input2, + label, + margin=0.5, + reduction='mean') + + self.assertRaises(ValueError, test_label_type_error) + + +class TestClassCosineEmbeddingLoss(unittest.TestCase): + + def setUp(self): + self.input1_np = np.random.random(size=(10, 3)).astype(np.float32) + self.input2_np = np.random.random(size=(10, 3)).astype(np.float32) + a = np.array([-1, -1, -1, -1, -1]).astype(np.int64) + b = np.array([1, 1, 1, 1, 1]).astype(np.int64) + self.label_np = np.concatenate((a, b), axis=0) + np.random.shuffle(self.label_np) + self.input1_np_1D = np.random.random(size=10).astype(np.float32) + self.input2_np_1D = np.random.random(size=10).astype(np.float32) + self.label_np_1D = np.array([1]).astype(np.int64) + + def run_dynamic(self): + input1 = paddle.to_tensor(self.input1_np) + input2 = paddle.to_tensor(self.input2_np) + label = paddle.to_tensor(self.label_np) + CosineEmbeddingLoss = paddle.nn.CosineEmbeddingLoss(margin=0.5, + reduction='mean') + dy_result = CosineEmbeddingLoss(input1, input2, label) + expected1 = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='mean') + self.assertTrue(np.allclose(dy_result.numpy(), expected1)) + self.assertTrue(dy_result.shape, [1]) + + input1_1D = paddle.to_tensor(self.input1_np_1D) + input2_1D = paddle.to_tensor(self.input2_np_1D) + label_1D = paddle.to_tensor(self.label_np_1D) + dy_result = CosineEmbeddingLoss(input1_1D, input2_1D, label_1D) + expected2 = cosine_embedding_loss(self.input1_np_1D, + self.input2_np_1D, + self.label_np_1D, + margin=0.5, + reduction='mean') + self.assertTrue(np.allclose(dy_result.numpy(), expected2)) + + def run_static(self): + input1 = static.data(name='input1', shape=[10, 3], dtype='float32') + input2 = static.data(name='input2', shape=[10, 3], dtype='float32') + label = static.data(name='label', shape=[10], dtype='int64') + CosineEmbeddingLoss = paddle.nn.CosineEmbeddingLoss(margin=0.5, + reduction='mean') + result = CosineEmbeddingLoss(input1, input2, label) + + place = paddle.CPUPlace() + exe = static.Executor(place) + exe.run(static.default_startup_program()) + static_result = exe.run(feed={ + "input1": self.input1_np, + "input2": self.input2_np, + "label": self.label_np + }, + fetch_list=[result]) + expected = cosine_embedding_loss(self.input1_np, + self.input2_np, + self.label_np, + margin=0.5, + reduction='mean') + + self.assertTrue(np.allclose(static_result[0], expected)) + + def test_cpu(self): + paddle.disable_static(place=paddle.CPUPlace()) + self.run_dynamic() + paddle.enable_static() + + with static.program_guard(static.Program()): + self.run_static() + + def test_errors(self): + + def test_margin_error(): + CosineEmbeddingLoss = paddle.nn.CosineEmbeddingLoss( + margin=2, reduction='mean') + + self.assertRaises(ValueError, test_margin_error) + + def test_reduction_error(): + CosineEmbeddingLoss = paddle.nn.CosineEmbeddingLoss( + margin=2, reduction='reduce_mean') + + self.assertRaises(ValueError, test_reduction_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index de416ca8093d7..20b176d7c7365 100644 --- 
a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -107,6 +107,7 @@ from .layer.loss import CTCLoss # noqa: F401 from .layer.loss import SmoothL1Loss # noqa: F401 from .layer.loss import HingeEmbeddingLoss # noqa: F401 +from .layer.loss import CosineEmbeddingLoss # noqa: F401 from .layer.norm import BatchNorm # noqa: F401 from .layer.norm import SyncBatchNorm # noqa: F401 from .layer.norm import GroupNorm # noqa: F401 @@ -311,5 +312,6 @@ def weight_norm(*args): 'MaxUnPool3D', 'HingeEmbeddingLoss', 'Identity', + 'CosineEmbeddingLoss', 'RReLU', ] diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 5e4d0dd3558f5..5de8c775ad7f4 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -90,6 +90,7 @@ from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 from .loss import hinge_embedding_loss # noqa: F401 +from .loss import cosine_embedding_loss # noqa: F401 from .norm import batch_norm # noqa: F401 from .norm import instance_norm # noqa: F401 from .norm import layer_norm # noqa: F401 @@ -229,5 +230,6 @@ 'class_center_sample', 'sparse_attention', 'fold', + 'cosine_embedding_loss', 'rrelu', ] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e6a3fdb464caf..58a8bb6538351 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -2763,3 +2763,112 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): return paddle.sum(loss, name=name) elif reduction == 'none': return loss + + +def cosine_embedding_loss(input1, + input2, + label, + margin=0, + reduction='mean', + name=None): + r""" + This operator computes the cosine embedding loss of Tensor ``input1``, ``input2`` and ``label`` as follows. + + If label = 1, then the loss value can be calculated as follow: + + .. math:: + Out = 1 - cos(input1, input2) + + If label = -1, then the loss value can be calculated as follow: + + .. math:: + Out = max(0, cos(input1, input2)) - margin + + The operator cos can be described as follow: + .. math:: + cos(x1, x2) = \frac{x1 \cdot{} x2}{\Vert x1 \Vert_2 * \Vert x2 \Vert_2} + + Parameters: + input1 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, 'M' means the length of input array. + Available dtypes are float32, float64. + input2 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, 'M' means the length of input array. + Available dtypes are float32, float64. + label (Tensor): tensor with shape: [N] or [1]. The target labels values should be -1 or 1. + Available dtypes are int32, int64, float32, float64. + margin (float, optional): Should be a number from :math:`-1` to :math:`1`, + :math:`0` to :math:`0.5` is suggested. If :attr:`margin` is missing, the + default value is :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of elements in the output + ``'sum'``: the output will be summed. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, the cosine embedding Loss of Tensor ``input1`` ``input2`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is [N], the same as ``input`` . 
+ If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + + Examples: + .. code-block:: python + :name: code-example1 + + import paddle + + input1 = paddle.to_tensor([[1.6, 1.2, -0.5], [3.2, 2.6, -5.8]], 'float32') + input2 = paddle.to_tensor([[0.5, 0.5, -1.8], [2.3, -1.4, 1.1]], 'float32') + label = paddle.to_tensor([1, -1], 'int64') + + output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='mean') + print(output) # [0.21155193] + + output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='sum') + print(output) # [0.42310387] + + output = paddle.nn.functional.cosine_embedding_loss(input1, input2, label, margin=0.5, reduction='none') + print(output) # [0.42310387, 0. ] + + """ + if len(label.shape) != 1: + raise ValueError( + "1D target tensor expected, multi-target not supported") + + if input1.shape != input2.shape: + raise ValueError( + "the shape of input tensor 1 should be equal to input tensor 2, but found inputs with " + "different sizes") + + if len(input1.shape) > 2: + raise ValueError( + "1D target tensor expects 1D or 2D input tensors, but found inputs with different sizes" + ) + + if input1.dtype not in [paddle.float32, paddle.float64]: + raise ValueError( + "The data type of input Variable must be 'float32' or 'float64'") + if label.dtype not in [ + paddle.int32, paddle.int64, paddle.float32, paddle.float64 + ]: + raise ValueError( + "The data type of label Variable must be 'int32', 'int64', 'float32', 'float64'" + ) + + prod_sum = (input1 * input2).sum(axis=-1) + mag_square1 = paddle.square(input1).sum(axis=-1) + 10e-12 + mag_square2 = paddle.square(input2).sum(axis=-1) + 10e-12 + denom = paddle.sqrt(mag_square1 * mag_square2) + cos = prod_sum / denom + zeros = paddle.zeros_like(cos) + pos = 1 - cos + neg = paddle.clip(cos - margin, min=0) + out_pos = paddle.where(label == 1, pos, zeros) + out_neg = paddle.where(label == -1, neg, zeros) + out = out_pos + out_neg + + if reduction == 'none': + return out + if reduction == 'mean': + return paddle.mean(out, name=name) + elif reduction == 'sum': + return paddle.sum(out, name=name) diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index c720ec7d1be07..0ec60ef473805 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -1309,3 +1309,94 @@ def forward(self, input, label): reduction=self.reduction, margin=self.margin, name=self.name) + + +class CosineEmbeddingLoss(Layer): + r""" + This interface is used to construct a callable object of the ``CosineEmbeddingLoss`` class. + The CosineEmbeddingLoss layer measures the cosine_embedding loss between input predictions ``input1``, ``input2`` + and target labels ``label`` with values 1 or 0. This is used for measuring whether two inputs are similar or + dissimilar and is typically used for learning nonlinear embeddings or semi-supervised learning. + The cosine embedding loss can be described as: + + If label = 1, then the loss value can be calculated as follow: + + .. math:: + Out = 1 - cos(input1, input2) + + If label = -1, then the loss value can be calculated as follow: + + .. math:: + Out = max(0, cos(input1, input2)) - margin + + The operator cos can be described as follow: + .. math:: + cos(x1, x2) = \frac{x1 \cdot{} x2}{\Vert x1 \Vert_2 * \Vert x2 \Vert_2} + + Parameters: + margin (float, optional): Should be a number from :math:`-1` to :math:`1`, + :math:`0` to :math:`0.5` is suggested. 
If :attr:`margin` is missing, the + default value is :math:`0`. + reduction (string, optional): Specifies the reduction to apply to the output: + ``'none'`` | ``'mean'`` | ``'sum'``. ``'none'``: no reduction will be applied, + ``'mean'``: the sum of the output will be divided by the number of + elements in the output, ``'sum'``: the output will be summed. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + input1 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, 'M' means the length of input array. + Available dtypes are float32, float64. + input2 (Tensor): tensor with shape: [N, M] or [M], 'N' means batch size, 'M' means the length of input array. + Available dtypes are float32, float64. + label (Tensor): tensor with shape: [N] or [1]. The target labels values should be -1 or 1. + Available dtypes are int32, int64, float32, float64. + output (Tensor): Tensor, the cosine embedding Loss of Tensor ``input1`` ``input2`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is [N], the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + + Examples: + .. code-block:: python + :name: code-example1 + + import paddle + + input1 = paddle.to_tensor([[1.6, 1.2, -0.5], [3.2, 2.6, -5.8]], 'float32') + input2 = paddle.to_tensor([[0.5, 0.5, -1.8], [2.3, -1.4, 1.1]], 'float32') + label = paddle.to_tensor([1, -1], 'int64') + + cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='mean') + output = cosine_embedding_loss(input1, input2, label) + print(output) # [0.21155193] + + cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='sum') + output = cosine_embedding_loss(input1, input2, label) + print(output) # [0.42310387] + + cosine_embedding_loss = paddle.nn.CosineEmbeddingLoss(margin=0.5, reduction='none') + output = cosine_embedding_loss(input1, input2, label) + print(output) # [0.42310387, 0. ] + + """ + + def __init__(self, margin=0, reduction='mean', name=None): + if margin > 1 or margin < -1: + raise ValueError( + "The value of 'margin' should be in the interval of [-1, 1], but received %f, which is not allowed." + % margin) + if reduction not in ['sum', 'mean', 'none']: + raise ValueError( + "The value of 'reduction' should be 'sum', 'mean' or " + "'none', but received %s, which is not allowed." 
% reduction) + super(CosineEmbeddingLoss, self).__init__() + self.margin = margin + self.reduction = reduction + self.name = name + + def forward(self, input1, input2, label): + return F.cosine_embedding_loss(input1, + input2, + label, + margin=self.margin, + reduction=self.reduction, + name=self.name) From cad139a71310011838d4d1180dc4bf4f85364c76 Mon Sep 17 00:00:00 2001 From: xiaoxiaohehe001 <49090790+xiaoxiaohehe001@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:40:54 +0800 Subject: [PATCH 45/53] call_once (#43206) --- paddle/phi/backends/gpu/gpu_context.cc | 122 +++++++++++++++++-------- 1 file changed, 82 insertions(+), 40 deletions(-) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index f68e451039092..ead53d648109d 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -214,23 +214,6 @@ struct GPUContext::Impl { &max_grid_dim_size_); phi::InitStream(&stream_); InitEigenDevice(); - phi::InitBlasHandle(&blas_handle_, stream_); -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); -#endif -#if CUDA_VERSION >= 11000 - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); -#endif -#endif - phi::InitBlasLtHandle(&blaslt_handle_); - phi::InitDnnHandle(&dnn_handle_, stream_, place_); - phi::InitSolverHandle(&solver_handle_, stream_); - phi::InitSparseHandle(&sparse_handle_, stream_); InitDnnWorkspace(); } @@ -246,23 +229,6 @@ struct GPUContext::Impl { &max_threads_per_block_, &max_grid_dim_size_); phi::InitStream(&stream_); - phi::InitBlasHandle(&blas_handle_, stream_); -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 9000 - phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); -#endif -#if CUDA_VERSION >= 11000 - phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( - blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); -#endif -#endif - phi::InitBlasLtHandle(&blaslt_handle_); - phi::InitDnnHandle(&dnn_handle_, stream_, place_); - phi::InitSolverHandle(&solver_handle_, stream_); - phi::InitSparseHandle(&sparse_handle_, stream_); } void PartialInitWithAllocator() { @@ -356,7 +322,28 @@ struct GPUContext::Impl { return eigen_device_; } - blasHandle_t GetBlasHandle() const { + blasHandle_t GetBlasHandle() { + std::call_once(flag_blas_, [=]() { + if (!blas_handle_) { + phi::InitBlasHandle(&blas_handle_, stream_); + } +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + if (!blas_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif +#if CUDA_VERSION >= 11000 + if (!blas_tf32_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); + } +#endif +#endif + }); PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr."); return blas_handle_; } @@ -373,12 +360,18 @@ struct GPUContext::Impl { void 
SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; } - blasLtHandle_t GetBlasLtHandle() const { + blasLtHandle_t GetBlasLtHandle() { + std::call_once(flag_blaslt_, [=]() { + if (!blaslt_handle_) phi::InitBlasLtHandle(&blaslt_handle_); + }); PD_CHECK(blaslt_handle_ != nullptr, "the gpu blasLt handle is nullptr."); return blaslt_handle_; } dnnHandle_t GetDnnHandle() { + std::call_once(flag_dnn_, [=]() { + if (!dnn_handle_) phi::InitDnnHandle(&dnn_handle_, stream_, place_); + }); PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); return dnn_handle_; } @@ -399,7 +392,10 @@ struct GPUContext::Impl { void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } - solverHandle_t GetSolverHandle() const { + solverHandle_t GetSolverHandle() { + std::call_once(flag_slover_, [=]() { + if (!solver_handle_) phi::InitSolverHandle(&solver_handle_, stream_); + }); PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); return solver_handle_; } @@ -461,8 +457,28 @@ struct GPUContext::Impl { #endif } - inline void CublasCall( - const std::function& callback) const { + inline void CublasCall(const std::function& callback) { + std::call_once(flag_cublas_, [=]() { + if (!blas_handle_) { + phi::InitBlasHandle(&blas_handle_, stream_); + } +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + if (!blas_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif +#if CUDA_VERSION >= 11000 + if (!blas_tf32_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); + } +#endif +#endif + }); if (blas_tf32_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tf32_mtx_); callback(blas_tf32_tensor_core_handle_); @@ -473,7 +489,26 @@ struct GPUContext::Impl { } inline void TensorCoreCublasCallIfAvailable( - const std::function& callback) const { + const std::function& callback) { + std::call_once(flag_tensorcore_cublas_, [=]() { + if (!blas_handle_) phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + if (!blas_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); + } +#endif +#if CUDA_VERSION >= 11000 + if (!blas_tf32_tensor_core_handle_) { + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); + } +#endif +#endif + }); if (blas_tensor_core_handle_ != nullptr) { std::lock_guard guard(blas_tensor_core_mtx_); callback(blas_tensor_core_handle_); @@ -563,6 +598,13 @@ struct GPUContext::Impl { sparseHandle_t sparse_handle_{nullptr}; DnnWorkspaceHandle* workspace_{nullptr}; + std::once_flag flag_blas_; + std::once_flag flag_blaslt_; + std::once_flag flag_dnn_; + std::once_flag flag_slover_; + std::once_flag flag_cublas_; + std::once_flag flag_tensorcore_cublas_; + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) // NCCL communicator (single process version) for NCCL collective operations. 
// NCCL collective operations provides fast collectives over multiple GPUs From 33949fc5f979c6df4c0ea17a6cd451fbda214558 Mon Sep 17 00:00:00 2001 From: Asthestarsfalll <72954905+Asthestarsfalll@users.noreply.github.com> Date: Wed, 8 Jun 2022 15:46:14 +0800 Subject: [PATCH 46/53] =?UTF-8?q?=E3=80=90Hackathon=20No.13=E3=80=91?= =?UTF-8?q?=E4=B8=BA=20Paddle=20=E6=96=B0=E5=A2=9E=20CyclicLR=20=E4=BC=98?= =?UTF-8?q?=E5=8C=96=E8=B0=83=E5=BA=A6=E5=99=A8=20(#40698)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add paddle.optimizer.lr.CyclicLR * add unittest of CyclicLR * fix code format * fix bug * try * fix CI-Coverage * fix ValueError * fix arguments assgin * fix code format and retry pulling develop to pass ci * fix typo * Refactor * fix function-redefined in test_lr_scheduler.py * update * fix conflict * update * gamma->exp_gamma * polish docs * fix code-style * adjust code format again * change format of __all__ in lr.py --- .../tests/unittests/test_lr_scheduler.py | 156 ++++++++++++ python/paddle/optimizer/lr.py | 226 +++++++++++++++++- 2 files changed, 376 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index fb2038819fcc5..def22575eea91 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -389,6 +389,53 @@ def anneal_func(start, end, pct): return computed_lr +def cyclic_lr(epoch_num, + base_learning_rate, + max_learning_rate, + step_size_up, + step_size_down, + mode, + exp_gamma=0.1, + scale_fn=None, + scale_mode='cycle', + verbose=False): + total_steps = step_size_up + step_size_down + step_ratio = step_size_up / total_steps + + def triangular(x): + return 1. + + def triangular2(x): + return 1 / (2.**(x - 1)) + + def exp_range(x): + return exp_gamma**x + + if scale_fn is None: + if mode == 'triangular': + scale_fn = triangular + scale_mode = 'cycle' + elif mode == 'triangular2': + scale_fn = triangular2 + scale_mode = 'cycle' + elif mode == 'exp_range': + scale_fn = exp_range + scale_mode = 'iterations' + + cycle = math.floor(1 + epoch_num / total_steps) + iterations = epoch_num + x = 1. 
+ epoch_num / total_steps - cycle + + if x <= step_ratio: + scale_factor = x / step_ratio + else: + scale_factor = (x - 1) / (step_ratio - 1) + + base_height = (max_learning_rate - base_learning_rate) * scale_factor + + return base_learning_rate + base_height * scale_fn(eval(scale_mode)) + + class TestLRScheduler(unittest.TestCase): def _test_static(self, python_func, paddle_api, kwarg, place): @@ -533,35 +580,89 @@ def test_scheduler(self): paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[1, 2, 3], gamma=2) + # check type of max_learning_rate with self.assertRaises(TypeError): paddle.optimizer.lr.OneCycleLR(max_learning_rate='test', total_steps=20) + # check value of max_learning_rate with self.assertRaises(ValueError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=-1.5, total_steps=20) + # check type of end_learning_rate with self.assertRaises(TypeError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps=20, end_learning_rate='test') + # check value of end_learning_rate with self.assertRaises(ValueError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps=20, end_learning_rate=-1) + # check type of total_steps with self.assertRaises(TypeError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps='test') + # check value of total_steps with self.assertRaises(ValueError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps=-10) + # check value of anneal_strategy with self.assertRaises(ValueError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps=20, anneal_strategy='test') + # check value of phase_pct when three_phase is True with self.assertRaises(ValueError): paddle.optimizer.lr.OneCycleLR(max_learning_rate=0.1, total_steps=20, phase_pct=0.6, three_phase=True) + # check type of max_learning_rate + with self.assertRaises(TypeError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate='test', + step_size_up=10) + # check value of max_learning_rate + with self.assertRaises(ValueError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=-1, + step_size_up=10) + # check type of step_size_up + with self.assertRaises(TypeError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up='test') + # check value of step_size_up + with self.assertRaises(ValueError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=-1) + # check type of step_size_down + with self.assertRaises(TypeError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=500, + step_size_down='test') + # check type of step_size_down + with self.assertRaises(ValueError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=500, + step_size_down=-1) + # check value of mode + with self.assertRaises(ValueError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=500, + step_size_down=500, + mode='test') + # check type value of scale_mode + with self.assertRaises(ValueError): + paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, + step_size_up=500, + step_size_down=-1, + scale_mode='test') func_api_kwargs = [ (noam_lr, paddle.optimizer.lr.NoamDecay, { @@ -671,6 +772,61 @@ def test_scheduler(self): "anneal_strategy": 'linear', "phase_pct": 0.2, "three_phase": True, + }), + (cyclic_lr, paddle.optimizer.lr.CyclicLR, { + "base_learning_rate": 0.5, + 
"max_learning_rate": 1.0, + "step_size_up": 15, + "step_size_down": 5, + "mode": 'triangular', + "exp_gamma": 1., + "scale_fn": None, + "scale_mode": 'cycle', + "verbose": False + }), + (cyclic_lr, paddle.optimizer.lr.CyclicLR, { + "base_learning_rate": 0.5, + "max_learning_rate": 1.0, + "step_size_up": 15, + "step_size_down": 5, + "mode": 'triangular2', + "exp_gamma": 1., + "scale_fn": None, + "scale_mode": 'cycle', + "verbose": False + }), + (cyclic_lr, paddle.optimizer.lr.CyclicLR, { + "base_learning_rate": 0.5, + "max_learning_rate": 1.0, + "step_size_up": 15, + "step_size_down": 5, + "mode": 'exp_range', + "exp_gamma": 0.8, + "scale_fn": None, + "scale_mode": 'cycle', + "verbose": False + }), + (cyclic_lr, paddle.optimizer.lr.CyclicLR, { + "base_learning_rate": 0.5, + "max_learning_rate": 1.0, + "step_size_up": 15, + "step_size_down": 5, + "mode": 'exp_range', + "exp_gamma": 1., + "scale_fn": lambda x: 0.95**x, + "scale_mode": 'cycle', + "verbose": False + }), + (cyclic_lr, paddle.optimizer.lr.CyclicLR, { + "base_learning_rate": 0.5, + "max_learning_rate": 1.0, + "step_size_up": 15, + "step_size_down": 5, + "mode": 'exp_range', + "exp_gamma": 1., + "scale_fn": lambda x: 0.95, + "scale_mode": 'iterations', + "verbose": False }) ] diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 883b2c1481703..4d7d128e05e49 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -20,10 +20,22 @@ from ..fluid.framework import _in_legacy_dygraph __all__ = [ # noqa - 'LRScheduler', 'NoamDecay', 'PiecewiseDecay', 'NaturalExpDecay', - 'InverseTimeDecay', 'PolynomialDecay', 'LinearWarmup', 'ExponentialDecay', - 'MultiStepDecay', 'StepDecay', 'LambdaDecay', 'ReduceOnPlateau', - 'CosineAnnealingDecay', 'MultiplicativeDecay', 'OneCycleLR' + 'LRScheduler', + 'NoamDecay', + 'PiecewiseDecay', + 'NaturalExpDecay', + 'InverseTimeDecay', + 'PolynomialDecay', + 'LinearWarmup', + 'ExponentialDecay', + 'MultiStepDecay', + 'StepDecay', + 'LambdaDecay', + 'ReduceOnPlateau', + 'CosineAnnealingDecay', + 'MultiplicativeDecay', + 'OneCycleLR', + 'CyclicLR', ] @@ -1681,7 +1693,7 @@ def __init__(self, if not isinstance(max_learning_rate, (float, int)): raise TypeError( "'max_learning_rate' must be 'float' or 'int', but received {}". - format(type(total_steps))) + format(type(max_learning_rate))) if max_learning_rate < 0: raise ValueError("'max_learning_rate' must be a positive integer.") @@ -1689,7 +1701,7 @@ def __init__(self, if not isinstance(end_learning_rate, (float, int)): raise TypeError( "'end_learning_rate' must be 'float' or 'int', but received {}". - format(type(total_steps))) + format(type(end_learning_rate))) if end_learning_rate < 0: raise ValueError("'end_learning_rate' must be a positive integer.") @@ -1792,3 +1804,205 @@ def get_lr(self): percentage = (current_step - self._step_config[i]) / step_size return self.anneal_func(self._lr_config[i], self._lr_config[i + 1], percentage) + + +class CyclicLR(LRScheduler): + r""" + Set the learning rate according to the cyclic learning rate (CLR) scheduler. + The scheduler regards the process of learning rate adjustment as one cycle after another. + It cycles the learning rate between two boundaries with a constant frequency. + The distance between the two boundaries can be scaled on a per-iteration or per-cycle basis. + + It has been proposed in `Cyclic Learning Rates for Training Neural Networks `_. 
+ + According to the paper, the cyclic learning rate schedule has three build-in scale methods: + + * "triangular": A basic triangular cycle without any amplitude scaling. + * "triangular2": A basic triangular cycle that reduce initial amplitude by half each cycle. + * "exp_range": A cycle that scales initial amplitude by scale function which is defined as :math:`gamma^{iterations}` . + + The initial amplitude is defined as max_learning_rate - base_learning_rate. + Also note that you should update learning rate each step. + + Args: + base_learning_rate (float): Initial learning rate, which is the lower boundary in the cycle. The paper recommends + that set the base_learning_rate to 1/3 or 1/4 of max_learning_rate. + max_learning_rate (float): Maximum learning rate in the cycle. It defines the cycle amplitude as above. + Since there is some scaling operation during process of learning rate adjustment, + max_learning_rate may not actually be reached. + step_size_up (int): Number of training steps, which is used to increase learning rate in a cycle. + The step size of one cycle will be defined by step_size_up + step_size_down. According to the paper, step + size should be set as at least 3 or 4 times steps in one epoch. + step_size_down (int, optional): Number of training steps, which is used to decrease learning rate in a cycle. + If not specified, it's value will initialize to `` step_size_up `` . Default: None + mode (str, optional): one of 'triangular', 'triangular2' or 'exp_range'. + If scale_fn is specified, this argument will be ignored. Default: 'triangular' + exp_gamma (float): Constant in 'exp_range' scaling function: exp_gamma**iterations. Used only when mode = 'exp_range'. Default: 1.0 + scale_fn (function, optional): A custom scaling function, which is used to replace three build-in methods. + It should only have one argument. For all x >= 0, 0 <= scale_fn(x) <= 1. + If specified, then 'mode' will be ignored. Default: None + scale_mode (str, optional): One of 'cycle' or 'iterations'. Defines whether scale_fn is evaluated on cycle + number or cycle iterations (total iterations since start of training). Default: 'cycle' + last_epoch (int, optional): The index of last epoch. Can be set to restart training.Default: -1, means initial learning rate. + verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . + + Returns: + ``CyclicLR`` instance to schedule learning rate. + + Examples: + .. 
code-block:: python + + import paddle + import numpy as np + + # train on default dynamic graph mode + linear = paddle.nn.Linear(10, 10) + scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) + for epoch in range(5): + for batch_id in range(20): + x = paddle.uniform([10, 10]) + out = linear(x) + loss = paddle.mean(out) + loss.backward() + sgd.step() + sgd.clear_gradients() + scheduler.step() # You should update learning rate each step + + # train on static graph mode + paddle.enable_static() + main_prog = paddle.static.Program() + start_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, start_prog): + x = paddle.static.data(name='x', shape=[None, 4, 5]) + y = paddle.static.data(name='y', shape=[None, 4, 5]) + z = paddle.static.nn.fc(x, 100) + loss = paddle.mean(z) + scheduler = paddle.optimizer.lr.CyclicLR(base_learning_rate=0.5, + max_learning_rate=1.0, step_size_up=15, step_size_down=5, verbose=True) + sgd = paddle.optimizer.SGD(learning_rate=scheduler) + sgd.minimize(loss) + + exe = paddle.static.Executor() + exe.run(start_prog) + for epoch in range(5): + for batch_id in range(20): + out = exe.run( + main_prog, + feed={ + 'x': np.random.randn(3, 4, 5).astype('float32'), + 'y': np.random.randn(3, 4, 5).astype('float32') + }, + fetch_list=loss.name) + scheduler.step() # You should update learning rate each step + """ + + def __init__(self, + base_learning_rate, + max_learning_rate, + step_size_up, + step_size_down=None, + mode='triangular', + exp_gamma=1., + scale_fn=None, + scale_mode='cycle', + last_epoch=-1, + verbose=False): + # check type and value of max_learning_rate + if not isinstance(max_learning_rate, (float, int)): + raise TypeError( + "'max_learning_rate' must be 'float' or 'int', but received {}". + format(type(max_learning_rate))) + if max_learning_rate < 0: + raise ValueError( + "'max_learning_rate' must be a positive integer, but received {}" + .format(max_learning_rate)) + + # check type and value of step_size_up + if not isinstance(step_size_up, int): + raise TypeError( + "The type of 'step_size_up' must be int, but received {}". + format(type(step_size_up))) + if step_size_up <= 0: + raise ValueError( + "'step_size_up' must be a positive integer, but received {}". + format(step_size_up)) + + # check type and value of step_size_down + if step_size_down is not None: + if not isinstance(step_size_down, int): + raise TypeError( + "The type of 'step_size_down' must be int, but received {}". 
+ format(type(step_size_down))) + if step_size_down <= 0: + raise ValueError( + "'step_size_down' must be a positive integer, but received {}" + .format(step_size_down)) + + # check type of exp_gamma + if not isinstance(exp_gamma, float): + raise TypeError( + "The type of 'exp_gamma' must be float, but received {}".format( + type(exp_gamma))) + + step_size_up = float(step_size_up) + step_size_down = float( + step_size_down) if step_size_down is not None else step_size_up + + self.cycle_size = step_size_up + step_size_down + self.step_up_pct = step_size_up / self.cycle_size + self.max_lr = float(max_learning_rate) + self.amplitude = self.max_lr - base_learning_rate + + if mode not in ['triangular', 'triangular2', 'exp_range' + ] and scale_fn is None: + raise ValueError( + "'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid" + ) + if scale_mode not in ['cycle', 'iterations']: + raise ValueError( + "'scale_mode' must be one of 'cycle' or 'iterations") + + self.mode = mode + self.gamma = exp_gamma # only for exp_range mode + + if scale_fn is None: + if self.mode == 'triangular': + self.scale_fn = self._triangular_scale_fn + self.scale_mode = 'cycle' + elif self.mode == 'triangular2': + self.scale_fn = self._triangular2_scale_fn + self.scale_mode = 'cycle' + elif self.mode == 'exp_range': + self.scale_fn = self._exp_range_scale_fn + self.scale_mode = 'iterations' + else: + self.scale_fn = scale_fn + self.scale_mode = scale_mode + super().__init__(base_learning_rate, last_epoch, verbose) + + def _triangular_scale_fn(self, x): + return 1. + + def _triangular2_scale_fn(self, x): + return 1 / (2.**(x - 1)) + + def _exp_range_scale_fn(self, x): + return self.gamma**x + + def get_lr(self): + iterations = self.last_epoch + + cycle = 1 + iterations // self.cycle_size + pct_per_cycle = 1. + iterations / self.cycle_size - cycle + + if pct_per_cycle <= self.step_up_pct: + scale_factor = pct_per_cycle / self.step_up_pct + else: + scale_factor = (1 - pct_per_cycle) / (1 - self.step_up_pct) + + base_height = self.amplitude * scale_factor + + lr = self.base_lr + base_height * self.scale_fn(eval(self.scale_mode)) + + return lr From 899efc4cc308f25f0aeb6d2bcf1c61237035c08b Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Wed, 8 Jun 2022 16:07:45 +0800 Subject: [PATCH 47/53] optimize installation and instruction of clang-format (#43248) --- tools/codestyle/clang_format.hook | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/codestyle/clang_format.hook b/tools/codestyle/clang_format.hook index 72608fd8b83fd..79a0a542ebc17 100755 --- a/tools/codestyle/clang_format.hook +++ b/tools/codestyle/clang_format.hook @@ -5,7 +5,15 @@ readonly VERSION="13.0.0" version=$(clang-format -version) +if ! [[ $(python -V 2>&1 | awk '{print $2}' | awk -F '.' '{print $1$2}') -ge 36 ]]; then + echo "clang-format installation by pip need python version great equal 3.6, + please change the default python to higher version." + exit 1 +fi + if ! 
[[ $version == *"$VERSION"* ]]; then + # low version of pip may not have the source of clang-format whl + pip install --upgrade pip pip install clang-format==13.0.0 fi From 3adeea60d85d316772c3d722687dd9c84c8dddd0 Mon Sep 17 00:00:00 2001 From: zhangchunle Date: Wed, 8 Jun 2022 18:05:13 +0800 Subject: [PATCH 48/53] test=document_fix (#43322) --- paddle/scripts/paddle_build.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index ad081d8128162..3ed5f992ed40c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -481,10 +481,10 @@ EOF } function cmake_gen_and_build() { - startTime_s=100 + startTime_s=`date +%s` cmake_gen $1 build $2 - endTime_s=200 + endTime_s=`date +%s` [ -n "$startTime_firstBuild" ] && startTime_s=$startTime_firstBuild echo "Build Time: $[ $endTime_s - $startTime_s ]s" echo "ipipe_log_param_Build_Time: $[ $endTime_s - $startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt From 07ede11816c46c3b2cca9a1a6d99241c2d84683a Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Wed, 8 Jun 2022 18:29:16 +0800 Subject: [PATCH 49/53] [NPU] fix reduce_max (#43230) --- paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index e3d8d15a305a9..8b9925a4c20ea 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -141,6 +141,9 @@ class ReduceMaxGradNPUKernel : public framework::OpKernel { Tensor tmp_out, tmp_out_grad; auto tmp_out_dims_vec = x_dims_vec; for (auto d : reduce_dims) { + if (d < 0) { + d += x_dims_vec.size(); + } tmp_out_dims_vec[d] = 1; } From 971e4791c482a494abfed4ebecc4db8ce8bcd205 Mon Sep 17 00:00:00 2001 From: zhaoyingli <86812880+zhaoyinglia@users.noreply.github.com> Date: Wed, 8 Jun 2022 19:01:22 +0800 Subject: [PATCH 50/53] [AutoParallel] add fetch_list in engine api (#43312) * add fetch_list * fix evaluate log * tiny fix --- .../distributed/auto_parallel/engine.py | 74 ++++++++++++------- .../unittests/auto_parallel/engine_api.py | 7 +- 2 files changed, 53 insertions(+), 28 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index fb12ae4971ae1..a0b2125f16642 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -28,7 +28,7 @@ from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator +from paddle.fluid.framework import Operator, Variable from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet @@ -256,6 +256,7 @@ def fit(self, train_data, batch_size=1, epochs=1, + fetch_list=None, steps_per_epoch=None, use_program_cache=False, return_numpy=True): @@ -266,13 +267,14 @@ def fit(self, "train model is not ready, please call `engine.prepare()` first." 
train_dataloader = self._create_dataloader(train_data, batch_size, epochs, steps_per_epoch) + self._usr_fetch_list = fetch_list outputs = [] for epoch in range(epochs): for step, data in enumerate(train_dataloader): - logs, loss = self._train_step(data, use_program_cache, + logs, outs = self._train_step(data, use_program_cache, return_numpy) - outputs.append(loss) + outputs.append(outs) train_logs = { "train_" + name: val for name, val in logs.items() @@ -283,86 +285,97 @@ def fit(self, def evaluate(self, eval_data, batch_size=1, + fetch_list=None, use_program_cache=False, return_numpy=True): self.mode = 'eval' assert self.mode in self._dist_main_progs, \ "eval model is not ready, please call `engine.prepare()` first." eval_dataloader = self._create_dataloader(eval_data, batch_size) + self._usr_fetch_list = fetch_list for step, data in enumerate(eval_dataloader): eval_logs = dict() - outs = self._eval_step(data, use_program_cache, return_numpy) + logs, outs = self._eval_step(data, use_program_cache, return_numpy) eval_logs["eval_loss"] = outs[0] if len(outs) > 0 else [] for metric in self._metrics: results = metric.accumulate() for i, res in enumerate(to_list(results)): eval_logs["eval_" + metric.name()[i]] = res + for name, val in logs.items(): + eval_logs["eval_" + name] = val self._logger.info(eval_logs) return eval_logs def predict(self, test_data, batch_size=1, + fetch_list=None, use_program_cache=False, return_numpy=True): self.mode = 'predict' assert self.mode in self._dist_main_progs, \ "predict model is not ready, please call `engine.prepare()` first." test_dataloader = self._create_dataloader(test_data, batch_size) + self._usr_fetch_list = fetch_list outputs = [] for step, data in enumerate(test_dataloader): logs, outs = self._predict_step(data, use_program_cache, return_numpy) outputs.append(outs) - predict_logs = { - "predict_" + name: val - for name, val in logs.items() - } + predict_logs = {"pred_" + name: val for name, val in logs.items()} self._logger.info(predict_logs) return outputs def _train_step(self, data, use_program_cache=False, return_numpy=True): logs = {} fetch_vars = self._fetch_vars[self.mode]["loss"] - fetch_list = self._fetch_list(fetch_vars) + fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) + fetch_list += usr_fetch_list - loss = self._executor.run(self.main_program, + outs = self._executor.run(self.main_program, fetch_list=fetch_list, use_program_cache=use_program_cache, return_numpy=return_numpy) - logs["loss"] = loss - return logs, loss + for i, out in enumerate(outs): + logs[fetch_list[i]] = out + return logs, outs def _eval_step(self, data, use_program_cache=False, return_numpy=True): logs = {} metrics = self._fetch_vars[self.mode]["metrics"] losses = self._fetch_vars[self.mode]["loss"] - fetch_loss = self._fetch_list(losses) - fetch_metrics = self._fetch_list(metrics) + fetch_loss, usr_fetch_list = self._fetch_list(losses) + fetch_metrics, usr_fetch_list = self._fetch_list(metrics) fetch_list = fetch_loss + fetch_metrics - res = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_program_cache, - return_numpy=return_numpy) - if not res[len(fetch_loss):]: - return res[:len(fetch_loss)] + outs = self._executor.run(self.main_program, + fetch_list=fetch_list + usr_fetch_list, + use_program_cache=use_program_cache, + return_numpy=return_numpy) + usr_out = outs[len(fetch_list):] + for i, out in enumerate(usr_out): + logs[usr_fetch_list[i]] = out + outs = outs[:len(fetch_list)] + if not outs[len(fetch_loss):]: + 
return logs, outs[:len(fetch_loss)] for metric in self._metrics: - metric.update(*res[len(fetch_loss):]) - return res[:len(fetch_loss)] + metric.update(*outs[len(fetch_loss):]) + return logs, outs[:len(fetch_loss)] def _predict_step(self, data, use_program_cache=False, return_numpy=True): logs = {} fetch_vars = self._fetch_vars[self.mode]["outputs"] - fetch_list = self._fetch_list(fetch_vars) + fetch_list, usr_fetch_list = self._fetch_list(fetch_vars) + fetch_list += usr_fetch_list outs = self._executor.run(self.main_program, fetch_list=fetch_list, use_program_cache=use_program_cache, return_numpy=return_numpy) - logs["pred"] = outs + for i, out in enumerate(outs): + logs[fetch_list[i]] = out return logs, outs def _fetch_list(self, fetch_vars): @@ -370,7 +383,18 @@ def _fetch_list(self, fetch_vars): for var in fetch_vars: if var.name in self.main_program.global_block().vars: fetch_list.append(var.name) - return fetch_list + usr_fetch_list = [] + if self._usr_fetch_list: + assert isinstance(self._usr_fetch_list, + list), "'fetch_list' type should be list." + for var in self._usr_fetch_list: + if isinstance(var, str): + if var in self.main_program.global_block().vars: + usr_fetch_list.append(var) + elif isinstance(var, Variable): + if var.name in self.main_program.global_block().vars: + usr_fetch_list.append(var.name) + return fetch_list, usr_fetch_list def _create_dataloader(self, dataset, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index e6a730f0a64d6..0d96c57c2437f 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -133,15 +133,16 @@ def train(): train_dataset = MyDataset(batch_num * batch_size) engine.fit(train_dataset, batch_size=batch_size, - steps_per_epoch=batch_num * batch_size) + steps_per_epoch=batch_num * batch_size, + fetch_list=['label']) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size) + engine.evaluate(eval_dataset, batch_size, fetch_list=['label']) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size) + engine.predict(test_dataset, batch_size, fetch_list=['label']) # save engine.save('./mlp_inf', training=False, mode='predict') From 811d57d8d0b8b96ea649c92a06ad650a3168387c Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Wed, 8 Jun 2022 19:38:47 +0800 Subject: [PATCH 51/53] disable lite gpu (#43177) --- cmake/external/lite.cmake | 5 +---- paddle/fluid/inference/lite/engine.cc | 4 ---- .../fluid/inference/lite/test_tensor_utils.cc | 10 --------- .../tests/api/lite_mul_model_test.cc | 22 ------------------- .../inference/tests/api/lite_resnet50_test.cc | 6 ++--- 5 files changed, 3 insertions(+), 44 deletions(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index b994f407604b4..1d5dd6ae8f425 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -115,7 +115,7 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) set(LITE_BUILD_COMMAND $(MAKE) publish_inference -j) set(LITE_OPTIONAL_ARGS -DWITH_MKL=ON - -DLITE_WITH_CUDA=${WITH_GPU} + -DLITE_WITH_CUDA=OFF -DWITH_MKLDNN=OFF -DLITE_WITH_X86=ON -DLITE_WITH_PROFILE=OFF @@ -124,9 +124,6 @@ if(NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) -DWITH_PYTHON=OFF -DWITH_TESTING=OFF -DLITE_BUILD_EXTRA=ON - -DCUDNN_ROOT=${CUDNN_ROOT} - -DLITE_WITH_STATIC_CUDA=OFF - -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME} -DLITE_WITH_XPU=${LITE_WITH_XPU} 
-DXPU_SDK_URL=${XPU_BASE_URL} -DXPU_SDK_ENV=${XPU_SDK_ENV} diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 8f8f68b170b62..615a90cdf5798 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -12,10 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#define LITE_WITH_CUDA 1 -#endif - #ifdef LITE_SUBGRAPH_WITH_XPU #define LITE_WITH_XPU 1 #endif diff --git a/paddle/fluid/inference/lite/test_tensor_utils.cc b/paddle/fluid/inference/lite/test_tensor_utils.cc index 09a6cda62b352..43e1d8770c37c 100644 --- a/paddle/fluid/inference/lite/test_tensor_utils.cc +++ b/paddle/fluid/inference/lite/test_tensor_utils.cc @@ -152,22 +152,12 @@ TEST(LiteEngineOp, TensorCopyAsync) { auto* ctx_cpu = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); test_tensor_copy(*ctx_cpu); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* ctx_gpu = - platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)); - test_tensor_copy(*ctx_gpu); -#endif } TEST(LiteEngineOp, TensorShare) { auto* ctx_cpu = platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); test_tensor_share(*ctx_cpu); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - auto* ctx_gpu = - platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0)); - test_tensor_share(*ctx_gpu); -#endif } } // namespace utils diff --git a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc index 1adbf0ec7a552..1677d00ac1f44 100644 --- a/paddle/fluid/inference/tests/api/lite_mul_model_test.cc +++ b/paddle/fluid/inference/tests/api/lite_mul_model_test.cc @@ -120,35 +120,13 @@ TEST(AnalysisPredictor, lite_xpu) { } #endif -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -TEST(AnalysisPredictor, thread_local_stream) { - const size_t thread_num = 5; - std::vector threads(thread_num); - Barrier barrier(thread_num); - for (size_t i = 0; i < threads.size(); ++i) { - threads[i] = std::thread([&barrier, i]() { - AnalysisConfig config; - config.EnableUseGpu(100, 0); - config.SetModel(FLAGS_infer_model + "/" + "mul_model"); - config.EnableGpuMultiStream(); - test_predictor(config, &barrier); - test_predictor_zero_copy(config); - }); - } - for (auto& th : threads) { - th.join(); - } -} - TEST(AnalysisPredictor, lite_engine) { AnalysisConfig config; - config.EnableUseGpu(100, 0); config.SetModel(FLAGS_infer_model + "/" + "mul_model"); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32); test_predictor(config); test_predictor_zero_copy(config); } -#endif } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index 169d0b9987d79..b519a7f9b6cea 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -23,10 +23,9 @@ limitations under the License. 
*/ namespace paddle { namespace inference { -TEST(AnalysisPredictor, use_gpu) { +TEST(AnalysisPredictor, use_cpu) { std::string model_dir = FLAGS_infer_model + "/" + "model"; AnalysisConfig config; - config.EnableUseGpu(100, 0); config.SetModel(model_dir + "/model", model_dir + "/params"); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); @@ -74,10 +73,9 @@ TEST(AnalysisPredictor, use_gpu) { namespace paddle_infer { -TEST(Predictor, use_gpu) { +TEST(Predictor, use_cpu) { std::string model_dir = FLAGS_infer_model + "/" + "model"; Config config; - config.EnableUseGpu(100, 0); config.SetModel(model_dir + "/model", model_dir + "/params"); config.EnableLiteEngine(PrecisionType::kFloat32); From cff1ef2be42eacdb56a35628c8e45649ccfae16d Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 8 Jun 2022 19:59:06 +0800 Subject: [PATCH 52/53] fix test_poisson UT (#43315) --- python/paddle/fluid/tests/unittests/test_poisson_op.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index 57adcd26959ae..51f19747f2b66 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -45,8 +45,8 @@ def setUp(self): self.config() self.attrs = {} - self.inputs = {'X': np.full([1024, 1024], self.lam, dtype=self.dtype)} - self.outputs = {'Out': np.ones([1024, 1024], dtype=self.dtype)} + self.inputs = {'X': np.full([2048, 1024], self.lam, dtype=self.dtype)} + self.outputs = {'Out': np.ones([2048, 1024], dtype=self.dtype)} def config(self): self.lam = 10 @@ -66,9 +66,9 @@ def test_check_grad_normal(self): self.check_grad( ['X'], 'Out', - user_defined_grads=[np.zeros([1024, 1024], dtype=self.dtype)], + user_defined_grads=[np.zeros([2048, 1024], dtype=self.dtype)], user_defined_grad_outputs=[ - np.random.rand(1024, 1024).astype(self.dtype) + np.random.rand(2048, 1024).astype(self.dtype) ]) @@ -77,7 +77,7 @@ class TestPoissonOp2(TestPoissonOp1): def config(self): self.lam = 5 self.a = 1 - self.b = 9 + self.b = 8 self.dtype = "float32" From cab0f2f503b03ba6497b25fbc4598249417457d1 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 8 Jun 2022 21:16:18 +0800 Subject: [PATCH 53/53] thread_local method to support predictor stream. 
(#42785) --- paddle/fluid/inference/api/analysis_config.cc | 24 ++ .../fluid/inference/api/analysis_predictor.cc | 267 ++++++++++++++---- .../fluid/inference/api/analysis_predictor.h | 23 +- .../api/analysis_predictor_tester.cc | 115 +++++++- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/api_impl.h | 2 +- paddle/fluid/inference/api/api_tester.cc | 4 +- .../inference/api/details/zero_copy_tensor.cc | 34 ++- .../api/details/zero_copy_tensor_test.cc | 17 +- .../inference/api/onnxruntime_predictor.cc | 16 +- .../inference/api/onnxruntime_predictor.h | 5 +- .../inference/api/paddle_analysis_config.h | 21 ++ paddle/fluid/inference/api/paddle_api.h | 10 +- .../inference/api/paddle_inference_api.h | 10 +- paddle/fluid/inference/api/paddle_tensor.h | 3 +- .../paddle_infer_api_copy_tensor_tester.cc | 5 +- .../inference/tests/infer_ut/test_LeViT.cc | 67 +++++ paddle/fluid/platform/device_context.cc | 36 ++- paddle/fluid/platform/device_context.h | 15 +- 19 files changed, 576 insertions(+), 100 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index c23397a082860..9fdc7a93cc27b 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -100,6 +100,24 @@ void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, Update(); } +void AnalysisConfig::SetExecStream(void *stream) { + PADDLE_ENFORCE_NOT_NULL(stream, platform::errors::InvalidArgument( + "`stream` should not be nullptr")); + exec_stream_ = stream; + use_external_stream_ = true; + Update(); +} + +void *AnalysisConfig::GetExecStream() const { + PADDLE_ENFORCE_NOT_NULL(exec_stream_, platform::errors::InvalidArgument( + "`stream` should not be nullptr")); + return exec_stream_; +} + +bool AnalysisConfig::external_stream_enabled() const { + return use_external_stream_; +} + void AnalysisConfig::DisableGpu() { use_gpu_ = false; @@ -239,6 +257,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_fc_padding_); // GPU related. CP_MEMBER(use_gpu_); + CP_MEMBER(use_external_stream_); + CP_MEMBER(exec_stream_); CP_MEMBER(use_cudnn_); CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); @@ -787,6 +807,8 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << params_file_; ss << use_gpu_; + ss << use_external_stream_; + ss << exec_stream_; ss << use_gpu_fp16_; for (auto &item : gpu_fp16_disabled_op_types_) ss << item; ss << use_fc_padding_; @@ -985,6 +1007,8 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)}); os.InsertRow({"memory_pool_init_size", std::to_string(memory_pool_init_size_mb_) + "MB"}); + os.InsertRow( + {"use_external_stream", use_external_stream_ ? "true" : "false"}); os.InsertRow( {"thread_local_stream", thread_local_stream_ ? 
"true" : "false"}); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 18229c302db39..7f30b80224e0d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -27,6 +27,7 @@ #include "paddle/fluid//platform/device/gpu/gpu_types.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -37,6 +38,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/infer_context.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/io_utils.h" @@ -198,6 +200,9 @@ bool AnalysisPredictor::Init( if (!PrepareScope(parent_scope)) { return false; } + + InitPlace(); + if (!CreateExecutor()) { return false; } @@ -213,56 +218,32 @@ bool AnalysisPredictor::Init( return true; } - return true; -} - -bool AnalysisPredictor::PrepareScope( - const std::shared_ptr &parent_scope) { - if (parent_scope) { - PADDLE_ENFORCE_NOT_NULL( - parent_scope, - platform::errors::PreconditionNotMet( - "Both program and parent_scope should be set in Clone mode.")); - scope_ = parent_scope; - status_is_cloned_ = true; - } else { - paddle::framework::InitDevices(); - paddle::framework::InitDefaultKernelSignatureMap(); - // TODO(wilber): we need to release memory occupied by weights. - scope_.reset(new paddle::framework::Scope()); - status_is_cloned_ = false; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(inference): Now only gpu with external stream support private + // device_context. + if (config_.use_gpu_ && config_.use_external_stream_) { + private_context_ = true; + } + if (private_context_) { + if (!status_is_cloned_) { + predictor_stream_ = config_.GetExecStream(); + } + // NOTE: If the external_stream equals to global_device_contexts's stream, + // then fallback. + auto global_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + if (predictor_stream_ != global_stream) { + InitResourceManager(predictor_stream_); + InitDeviceContexts(); + } } - sub_scope_ = &scope_->NewScope(); +#endif return true; } -bool AnalysisPredictor::PrepareProgram( - const std::shared_ptr &program) { - if (!program) { - if (!LoadProgramDesc()) return false; - // If not cloned, the parameters should be loaded. - // If config_.ir_optim() is True, parameters is loaded in - // OptimizeInferenceProgram(), but other persistable variables - // (like RAW type var) are not created in scope. - // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), - // still need to create other persistable variables. - // So in both case, create persistable variables at first. - executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); - // if enable_ir_optim_ is false, - // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will - // not be executed. - OptimizeInferenceProgram(); - } else { - // If the program is passed from external, no need to optimize it, this - // logic is used in the clone scenario. 
- inference_program_ = program; - } - - executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); - - return true; -} -bool AnalysisPredictor::CreateExecutor() { +void AnalysisPredictor::InitPlace() { if (config_.use_gpu()) { PADDLE_ENFORCE_EQ(config_.use_xpu(), false, platform::errors::InvalidArgument( @@ -345,6 +326,160 @@ bool AnalysisPredictor::CreateExecutor() { } else { place_ = paddle::platform::CPUPlace(); } +} + +void AnalysisPredictor::InitResourceManager(void *stream) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + predictor_stream_ = + ResourceManager::Instance().InitGPUResource(place_, stream); +#endif +} + +void AnalysisPredictor::InitDeviceContexts() { +// Init GPUContext. +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (place_.GetType() == phi::AllocationType::GPU) { + device_contexts_.emplace( + place_, std::async(std::launch::deferred, [=] { + auto *gpu_resource = + ResourceManager::Instance().GetGPUResource(predictor_stream_); + auto *gpu_context = new InferGPUContext(); + gpu_context->SetAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_, gpu_resource->GetStream()) + .get()); + gpu_context->SetPinnedAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + gpu_context->SetHostAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CPUPlace()) + .get()); + gpu_context->SetZeroAllocator( + memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(place_) + .get()); + gpu_context->SetGenerator( + framework::DefaultCUDAGenerator(place_.GetDeviceId()).get()); + gpu_context->SetHostGenerator(framework::DefaultCPUGenerator().get()); + + gpu_context->SetStream(gpu_resource->GetStream()); + gpu_context->SetBlasHandle(gpu_resource->GetBlasHandle()); + gpu_context->SetBlasTensorCoreHandle( + gpu_resource->GetBlasTensorCoreHandle()); + gpu_context->SetBlasTF32Handle(gpu_resource->GetBlasTF32Handle()); + gpu_context->SetDnnHandle(gpu_resource->GetDnnHandle()); + gpu_context->SetSolverHandle(gpu_resource->GetSolverDnHandle()); + gpu_context->SetSparseHandle(gpu_resource->GetSparseHandle()); + gpu_context->SetEigenDevice(gpu_resource->GetGpuEigenDevice()); + gpu_context->SetComputeCapability( + gpu_resource->GetGpuComputeCapability()); + gpu_context->SetMaxThreadsPerBlock( + gpu_resource->GetGpuMaxThreadsPerBlock()); + gpu_context->SetMaxThreadsPerMultiProcessor( + gpu_resource->GetGpuMaxThreadsPerMp()); + gpu_context->SetMaxGridDimSize(gpu_resource->GetGpuMaxGridDimSize()); + gpu_context->SetMultiProcessors( + gpu_resource->GetGPUMultiProcessors()); + gpu_context->SetDriverVersion(gpu_resource->GetGpuDriverVersion()); + gpu_context->SetRuntimeVersion(gpu_resource->GetGpuRuntimeVersion()); + VLOG(1) << "thread id is " << std::this_thread::get_id() + << ", stream id is " + << reinterpret_cast(gpu_resource->GetStream()) + << ", allotor ptr is " + << reinterpret_cast( + memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_, gpu_resource->GetStream()) + .get()); + return std::unique_ptr(gpu_context); + })); + } +#endif + // TODO(Inference): Support other backends. 
+} + +void *AnalysisPredictor::GetExecStream() const { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (place_.GetType() == phi::AllocationType::GPU) { + if (private_context_) { + return predictor_stream_; + } else { + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + return reinterpret_cast(pool.Get(place_)) + ->stream(); + } + } else { + return nullptr; + } + return nullptr; +#else + // TODO(inference): Support other backends. + return nullptr; +#endif +} + +const void *AnalysisPredictor::GetDeviceContexts() const { + if (private_context_) { + return &device_contexts_; + } else { + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + const auto &dev_ctxs = pool.device_contexts(); + return &dev_ctxs; + } +} + +bool AnalysisPredictor::PrepareScope( + const std::shared_ptr &parent_scope) { + if (parent_scope) { + PADDLE_ENFORCE_NOT_NULL( + parent_scope, + platform::errors::PreconditionNotMet( + "Both program and parent_scope should be set in Clone mode.")); + scope_ = parent_scope; + status_is_cloned_ = true; + } else { + paddle::framework::InitDevices(); + paddle::framework::InitDefaultKernelSignatureMap(); + // TODO(wilber): we need to release memory occupied by weights. + scope_.reset(new paddle::framework::Scope()); + status_is_cloned_ = false; + } + sub_scope_ = &scope_->NewScope(); + return true; +} + +bool AnalysisPredictor::PrepareProgram( + const std::shared_ptr &program) { + if (!program) { + if (!LoadProgramDesc()) return false; + // If not cloned, the parameters should be loaded. + // If config_.ir_optim() is True, parameters is loaded in + // OptimizeInferenceProgram(), but other persistable variables + // (like RAW type var) are not created in scope. + // If config_.ir_optim() is False, parameters is loaded in LoadParameters(), + // still need to create other persistable variables. + // So in both case, create persistable variables at first. + executor_->CreateVariables(*inference_program_, 0, true, sub_scope_); + + // if enable_ir_optim_ is false, + // the analysis pass(op fuse, graph analysis, trt subgraph, mkldnn etc) will + // not be executed. + OptimizeInferenceProgram(); + } else { + // If the program is passed from external, no need to optimize it, this + // logic is used in the clone scenario. 
+ inference_program_ = program; + } + + executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); + + return true; +} + +bool AnalysisPredictor::CreateExecutor() { executor_.reset(new paddle::framework::NaiveExecutor(place_)); return true; } @@ -1222,8 +1357,8 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( platform::errors::PreconditionNotMet( "The variable named %s is not found in the scope of the executor.", name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope))); + std::unique_ptr res(new ZeroCopyTensor( + static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1277,8 +1412,8 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( platform::errors::PreconditionNotMet( "The variable named %s is not found in the scope of the executor.", name)); - std::unique_ptr res( - new ZeroCopyTensor(static_cast(scope))); + std::unique_ptr res(new ZeroCopyTensor( + static_cast(scope), this->GetDeviceContexts())); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -1327,6 +1462,9 @@ bool AnalysisPredictor::ZeroCopyRun() { return true; } #endif + if (private_context_) { + paddle::platform::DeviceContextPool::SetDeviceContexts(&device_contexts_); + } paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) { @@ -1352,6 +1490,9 @@ bool AnalysisPredictor::ZeroCopyRun() { // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); + if (private_context_) { + paddle::platform::DeviceContextPool::SetDeviceContexts(nullptr); + } #ifdef PADDLE_WITH_MKLDNN if (config_.use_mkldnn_) MkldnnPostReset(); #endif @@ -1659,15 +1800,31 @@ AnalysisPredictor::~AnalysisPredictor() { if (config_.shape_range_info_collected()) { StatisticShapeRangeInfo(); } - +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + if (predictor_stream_ != nullptr) { + ResourceManager::Instance().DestroyGPUResource(predictor_stream_); + } +#endif if (place_.GetType() != phi::AllocationType::UNDEFINED) { memory::Release(place_); } + device_contexts_.clear(); } -std::unique_ptr AnalysisPredictor::Clone() { +std::unique_ptr AnalysisPredictor::Clone(void *stream) { std::lock_guard lk(clone_mutex_); auto *x = new AnalysisPredictor(config_); + x->status_is_cloned_ = true; + if (config_.use_external_stream_ && stream == nullptr) { + PADDLE_THROW(platform::errors::InvalidArgument( + "config has been configured to use external stream, but the Clone " + "function has not received a valid stream parameter.")); + } else if (!config_.use_external_stream_ && stream != nullptr) { + PADDLE_THROW(platform::errors::InvalidArgument( + "config has not been configured to use external stream, but the Clone " + "function has received a stream parameter.")); + } + x->predictor_stream_ = stream; x->Init(scope_, inference_program_); x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); return std::unique_ptr(x); @@ -1853,8 +2010,8 @@ std::unique_ptr Predictor::GetOutputHandle(const std::string &name) { bool Predictor::Run() { return predictor_->ZeroCopyRun(); } -std::unique_ptr Predictor::Clone() { - auto analysis_pred = predictor_->Clone(); +std::unique_ptr Predictor::Clone(void *stream) { + auto analysis_pred = predictor_->Clone(stream); std::unique_ptr pred(new Predictor(std::move(analysis_pred))); return pred; } @@ -1865,6 
+2022,8 @@ void Predictor::ClearIntermediateTensor() { uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); } +void *Predictor::GetExecStream() const { return predictor_->GetExecStream(); } + int GetNumBytesOfDataType(DataType dtype) { switch (dtype) { case DataType::FLOAT32: diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 1cfdaf1a55864..ff17353f303a3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -28,6 +28,7 @@ #include "paddle/fluid/inference/api/details/reset_tensor_array.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/api/resource_manager.h" #include "paddle/fluid/platform/device/gpu/gpu_types.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/string/printf.h" @@ -184,6 +185,14 @@ class AnalysisPredictor : public PaddlePredictor { bool ExpRunWithExternalStream(const gpuStream_t stream); #endif + /// + /// \brief Get the execution stream on devices with a concept of stream, + /// otherwise returns nullptr. + /// + /// \return The execution stream or nullptr (CPU). + /// + void *GetExecStream() const override; + /// /// \brief Create feed fetch variables /// @@ -235,7 +244,7 @@ class AnalysisPredictor : public PaddlePredictor { /// /// \return get a new predictor /// - std::unique_ptr Clone() override; + std::unique_ptr Clone(void *stream = nullptr) override; /// /// \brief Get the scope used by predictor /// @@ -393,10 +402,17 @@ class AnalysisPredictor : public PaddlePredictor { FRIEND_TEST(AnalysisPredictor, with_gpu); #endif + protected: + const void *GetDeviceContexts() const override; + private: void StatisticShapeRangeInfo(); void CollectShapeRangeInfo(); + void InitPlace(); + void InitDeviceContexts(); + void InitResourceManager(void *stream); + #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet exe related @@ -489,6 +505,11 @@ class AnalysisPredictor : public PaddlePredictor { std::map>> shape_info_; static int clone_num_; + bool private_context_{false}; + void *predictor_stream_{nullptr}; + std::map>> + device_contexts_; + #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) // fleet executor related distributed::FleetExecutorDesc executor_desc_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index f16054565a7fc..1e45c24534267 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/resource_manager.h" #if defined(PADDLE_WITH_CUDA) #include #endif @@ -183,18 +184,11 @@ TEST(AnalysisPredictor, CollectShapeRangeInfo) { w1->Reshape({4, 1}); w2->Reshape({4, 1}); w3->Reshape({4, 1}); - - auto* w0_data = w0->mutable_data(PaddlePlace::kCPU); - auto* w1_data = w1->mutable_data(PaddlePlace::kCPU); - auto* w2_data = w2->mutable_data(PaddlePlace::kCPU); - auto* w3_data = w3->mutable_data(PaddlePlace::kCPU); - - for (int i = 0; i < 4; i++) { - w0_data[i] = i; - w1_data[i] = i; - w2_data[i] = i; - w3_data[i] = i; - } + std::vector input_data{0, 1, 2, 3}; + w0->copy_from_cpu(input_data.data()); + w1->copy_from_cpu(input_data.data()); + w2->copy_from_cpu(input_data.data()); + w3->copy_from_cpu(input_data.data()); predictor->ZeroCopyRun(); @@ -539,6 +533,103 @@ TEST(Tensor, GpuShareExternalData) { LOG(INFO) << "output size: " << size / sizeof(float); predictor->TryShrinkMemory(); } + +TEST(Predictor, Streams) { + // internal stream. + { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + auto predictor = CreatePredictor(config); + gpuStream_t stream = + reinterpret_cast(predictor->GetExecStream()); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 0); + } + + // internal stream, create 2 predictor. + { + Config config1; + config1.SetModel(FLAGS_dirname); + config1.EnableUseGpu(100, 0); + auto predictor1 = CreatePredictor(config1); + gpuStream_t stream1 = + reinterpret_cast(predictor1->GetExecStream()); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream1), 0); + + Config config2; + config2.SetModel(FLAGS_dirname); + config2.EnableUseGpu(100, 0); + auto predictor2 = CreatePredictor(config2); + gpuStream_t stream2 = + reinterpret_cast(predictor2->GetExecStream()); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 0); + CHECK_EQ(stream1, stream2); + } + + // internal stream, clone + { + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + auto predictor = CreatePredictor(config); + gpuStream_t stream = + reinterpret_cast(predictor->GetExecStream()); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 0); + + auto predictor2 = predictor->Clone(); + gpuStream_t stream2 = + reinterpret_cast(predictor2->GetExecStream()); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 0); + CHECK_EQ(stream, stream2); + } + + // external stream + { + cudaStream_t external_stream; + cudaStreamCreate(&external_stream); + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + config.SetExecStream(external_stream); + CHECK_EQ(config.external_stream_enabled(), true); + + auto predictor = CreatePredictor(config); + gpuStream_t stream = + reinterpret_cast(predictor->GetExecStream()); + CHECK_EQ(external_stream, stream); + CHECK_NOTNULL(paddle::ResourceManager::Instance().GetGPUResource(stream)); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1); + } + + // 2 predictor on 2 stream + { + cudaStream_t external_stream; + cudaStreamCreate(&external_stream); + Config config; + config.SetModel(FLAGS_dirname); + config.EnableUseGpu(100, 0); + config.SetExecStream(external_stream); + auto predictor = CreatePredictor(config); + gpuStream_t stream = + reinterpret_cast(predictor->GetExecStream()); + CHECK_NOTNULL(paddle::ResourceManager::Instance().GetGPUResource(stream)); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream), 1); + + 
cudaStream_t external_stream2; + cudaStreamCreate(&external_stream2); + Config config2; + config2.SetModel(FLAGS_dirname); + config2.EnableUseGpu(100, 0); + config2.SetExecStream(external_stream2); + auto predictor2 = CreatePredictor(config2); + gpuStream_t stream2 = + reinterpret_cast(predictor2->GetExecStream()); + CHECK_NOTNULL(paddle::ResourceManager::Instance().GetGPUResource(stream2)); + CHECK_EQ(paddle::ResourceManager::Instance().RefCount(stream2), 1); + + CHECK_NE(stream, stream2); + } +} #endif } // namespace paddle_infer diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 38960aecb703b..28f2010161ded 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -181,7 +181,7 @@ bool NativePaddlePredictor::Run(const std::vector &inputs, return true; } -std::unique_ptr NativePaddlePredictor::Clone() { +std::unique_ptr NativePaddlePredictor::Clone(void *stream) { std::lock_guard lk(clone_mutex_); VLOG(3) << "Predictor::clone"; std::unique_ptr cls(new NativePaddlePredictor(config_)); diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index d503d2581392a..14a7e22e6444f 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -51,7 +51,7 @@ class NativePaddlePredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; - std::unique_ptr Clone() override; + std::unique_ptr Clone(void *stream = nullptr) override; ~NativePaddlePredictor() override; diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 1faf46fad2be6..cc6527a7e554e 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -46,7 +46,9 @@ class DemoPredictor : public PaddlePredictor { return false; } - std::unique_ptr Clone() override { return nullptr; } + std::unique_ptr Clone(void *stream = nullptr) override { + return nullptr; + } ~DemoPredictor() override {} }; diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 661d9def40653..ae0af77319ece 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -94,7 +94,17 @@ T *Tensor::mutable_data(PlaceType place) { return tensor->mutable_data(paddle::platform::CPUPlace()); } case static_cast(PlaceType::kGPU): { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + paddle::platform::CUDAPlace gpu_place(device_); + auto *dev_ctxs = reinterpret_cast>> + *>(device_contexs_); + auto *dev_ctx = + static_cast(dev_ctxs->at(gpu_place).get().get()); + return dev_ctx->Alloc(tensor, tensor->numel() * sizeof(T)); +#else return tensor->mutable_data(paddle::platform::CUDAPlace(device_)); +#endif } case static_cast(PlaceType::kXPU): { return tensor->mutable_data(paddle::platform::XPUPlace(device_)); @@ -181,12 +191,14 @@ void Tensor::CopyFromCpu(const T *data) { std::memcpy(static_cast(t_data), data, ele_size); } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); + paddle::platform::CUDAPlace gpu_place(device_); - auto *t_data = tensor->mutable_data(gpu_place); - auto *dev_ctx = static_cast( - pool.Get(gpu_place)); + auto *dev_ctxs = reinterpret_cast>> *>( + device_contexs_); + auto *dev_ctx = + 
static_cast(dev_ctxs->at(gpu_place).get().get()); + auto *t_data = dev_ctx->Alloc(tensor, tensor->numel() * sizeof(T)); paddle::memory::Copy(gpu_place, static_cast(t_data), paddle::platform::CPUPlace(), data, ele_size, @@ -359,11 +371,12 @@ void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, #endif } else if (place_ == PlaceType::kGPU) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - paddle::platform::DeviceContextPool &pool = - paddle::platform::DeviceContextPool::Instance(); auto gpu_place = t_place; - auto *dev_ctx = static_cast( - pool.Get(gpu_place)); + auto *dev_ctxs = reinterpret_cast>> *>( + device_contexs_); + auto *dev_ctx = + static_cast(dev_ctxs->at(gpu_place).get().get()); paddle::memory::Copy(paddle::platform::CPUPlace(), static_cast(data), gpu_place, t_data, ele_num * sizeof(T), dev_ctx->stream()); @@ -547,7 +560,8 @@ template PD_INFER_DECL uint8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL int8_t *Tensor::mutable_data(PlaceType place); template PD_INFER_DECL float16 *Tensor::mutable_data(PlaceType place); -Tensor::Tensor(void *scope) : scope_{scope} {} +Tensor::Tensor(void *scope, const void *device_contexts) + : scope_{scope}, device_contexs_(device_contexts) {} template void *Tensor::FindTensor() const { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 4b6f90f3f0652..5a04656bc3071 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -25,14 +25,19 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_tensor.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" namespace paddle_infer { struct TensorWrapper : public Tensor { - TensorWrapper(paddle_infer::PlaceType place, paddle::framework::Scope* scope, - const std::string& name) - : Tensor{static_cast(scope)} { + TensorWrapper( + paddle_infer::PlaceType place, paddle::framework::Scope* scope, + const std::map>>* + dev_ctxs, + const std::string& name) + : Tensor{static_cast(scope), dev_ctxs} { SetPlace(place, 0 /*device_id*/); SetName(name); input_or_output_ = true; @@ -42,7 +47,11 @@ struct TensorWrapper : public Tensor { std::unique_ptr CreateTensor(paddle_infer::PlaceType place, paddle::framework::Scope* scope, const std::string& name) { - return std::unique_ptr(new TensorWrapper{place, scope, name}); + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + const auto& dev_ctxs = pool.device_contexts(); + return std::unique_ptr( + new TensorWrapper{place, scope, &dev_ctxs, name}); } template diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.cc b/paddle/fluid/inference/api/onnxruntime_predictor.cc index 93a96863053e5..326da0e4339ad 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.cc +++ b/paddle/fluid/inference/api/onnxruntime_predictor.cc @@ -243,7 +243,7 @@ std::unique_ptr ONNXRuntimePredictor::GetInputTensor( "The in variable named %s is not found in the " "ONNXPredictor.", name)); - std::unique_ptr res(new ZeroCopyTensor(nullptr)); + std::unique_ptr res(new ZeroCopyTensor(nullptr, this)); res->input_or_output_ = true; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -264,7 +264,7 @@ std::unique_ptr ONNXRuntimePredictor::GetOutputTensor( "The out variable named %s is not found in the " 
"ONNXPredictor.", name)); - std::unique_ptr res(new ZeroCopyTensor(nullptr)); + std::unique_ptr res(new ZeroCopyTensor(nullptr, this)); res->input_or_output_ = false; res->SetName(name); if (platform::is_cpu_place(place_)) { @@ -309,7 +309,7 @@ bool ONNXRuntimePredictor::ZeroCopyRun() { return true; } -std::unique_ptr ONNXRuntimePredictor::Clone() { +std::unique_ptr ONNXRuntimePredictor::Clone(void *stream) { LOG(ERROR) << "Not support Clone(), Please create new Predictor"; return nullptr; } @@ -325,4 +325,14 @@ ONNXRuntimePredictor::~ONNXRuntimePredictor() { memory::Release(place_); } +const void *ONNXRuntimePredictor::GetDeviceContexts() const { + // TODO(inference): Support private device contexts. + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + const auto &dev_ctxs = pool.device_contexts(); + return &const_cast>> &>( + dev_ctxs); +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/onnxruntime_predictor.h b/paddle/fluid/inference/api/onnxruntime_predictor.h index 294a83a4335ba..4c44a7dc0a9e4 100644 --- a/paddle/fluid/inference/api/onnxruntime_predictor.h +++ b/paddle/fluid/inference/api/onnxruntime_predictor.h @@ -174,7 +174,10 @@ class ONNXRuntimePredictor : public PaddlePredictor { /// /// \return get a new predictor /// - std::unique_ptr Clone() override; + std::unique_ptr Clone(void *stream = nullptr) override; + + protected: + const void *GetDeviceContexts() const override; private: /// diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 489c32bc59d17..fe82bbf29cbb2 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -585,6 +585,25 @@ struct PD_INFER_DECL AnalysisConfig { /// bool trt_allow_build_at_runtime(); + /// + /// \brief Set execution stream. If not set a stream will be created + /// internally. + /// + void SetExecStream(void* stream); + + /// + /// \brief Get execution stream. The user needs to explicitly cast into a + /// stream type such as cudaStream_t, hipStream_t, etc. + /// + void* GetExecStream() const; + + /// + /// \brief Whether the external stream is used, if True, the predictor clone + /// operation must use the external stream, otherwise the framework manages + /// the stream internally. + /// + bool external_stream_enabled() const; + /// /// \brief Collect shape info of all tensors in compute graph. /// @@ -926,6 +945,8 @@ struct PD_INFER_DECL AnalysisConfig { "matrix_nms"}; bool use_cudnn_{false}; + bool use_external_stream_{false}; + void* exec_stream_{nullptr}; // NPU related bool use_npu_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 78af756c24b03..b28370fb8221c 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -195,7 +195,8 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { private: friend class AnalysisPredictor; friend class ONNXRuntimePredictor; - explicit ZeroCopyTensor(void* scope) : paddle_infer::Tensor{scope} {} + explicit ZeroCopyTensor(void* scope, const void* device_contexts) + : paddle_infer::Tensor{scope, device_contexts} {} }; /// \brief A Predictor for executing inference on a model. @@ -286,7 +287,7 @@ class PD_INFER_DECL PaddlePredictor { /// When using clone, the same network will be created, /// and the parameters between them are shared. 
/// \return unique_ptr which contains the pointer of predictor - virtual std::unique_ptr Clone() = 0; + virtual std::unique_ptr Clone(void* stream = nullptr) = 0; /// \brief Destroy the Predictor. virtual ~PaddlePredictor() = default; @@ -300,6 +301,11 @@ class PD_INFER_DECL PaddlePredictor { struct Config { std::string model_dir; /*!< path to the model directory. */ }; + + virtual void* GetExecStream() const { return nullptr; } + + protected: + virtual const void* GetDeviceContexts() const { return nullptr; } }; /// diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 58ccd79d84d6e..3111db026c4e6 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -133,7 +133,7 @@ class PD_INFER_DECL Predictor { /// /// \return get a new predictor /// - std::unique_ptr Clone(); + std::unique_ptr Clone(void* stream = nullptr); /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); @@ -149,6 +149,14 @@ class PD_INFER_DECL Predictor { /// uint64_t TryShrinkMemory(); + /// + /// \brief Get the execution stream on devices with a concept of stream, + /// otherwise returns nullptr. + /// + /// \return The execution stream or nullptr (CPU). + /// + void* GetExecStream() const; + private: std::unique_ptr predictor_; friend class paddle_infer::experimental::InternalUtils; diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index 11086b369fc15..39ba366f35d2b 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -162,7 +162,7 @@ class PD_INFER_DECL Tensor { PlaceType place() const; protected: - explicit Tensor(void* scope); + explicit Tensor(void* scope, const void* device_contexs); template void* FindTensor() const; @@ -181,6 +181,7 @@ class PD_INFER_DECL Tensor { DataType dtype_; bool input_or_output_; void* scope_{nullptr}; + const void* device_contexs_{nullptr}; PlaceType place_; int device_; diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc index 38bcb7645abb5..36fcf4ba8da16 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_copy_tensor_tester.cc @@ -32,7 +32,10 @@ class InferApiTesterUtils { const std::string &name, PlaceType place, void *p_scope) { auto var = static_cast(p_scope)->Var(name); var->GetMutable(); - std::unique_ptr res(new Tensor(p_scope)); + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + const auto &dev_ctxs = pool.device_contexts(); + std::unique_ptr res(new Tensor(p_scope, &dev_ctxs)); res->input_or_output_ = true; res->SetName(name); res->SetPlace(place, 0 /*device id*/); diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index b069feaec1ae7..c9298692334c0 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -13,6 +13,9 @@ // limitations under the License. 
#include "test_suite.h" // NOLINT +#ifdef PADDLE_WITH_GPU +#include +#endif DEFINE_string(modeldir, "", "Directory of the inference model."); @@ -170,6 +173,70 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { std::cout << "finish multi-thread test" << std::endl; } +#ifdef PADDLE_WITH_GPU +TEST(tensorrt_tester_LeViT, multi_stream_thread4_trt_fp32_bz2) { + int thread_num = 4; + + // init stream + std::vector streams(thread_num); + for (size_t i = 0; i < thread_num; ++i) { + cudaStreamCreate(&streams[i]); + } + + // init input data + std::map my_input_data_map; + my_input_data_map["x"] = PrepareInput(2); + // init output data + std::map infer_output_data, + truth_output_data; + // prepare groudtruth config + paddle_infer::Config config, config_no_ir; + config_no_ir.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config_no_ir.SwitchIrOptim(false); + // prepare inference config + config.SetModel(FLAGS_modeldir + "/inference.pdmodel", + FLAGS_modeldir + "/inference.pdiparams"); + config.EnableUseGpu(100, 0); + config.EnableTensorRtEngine( + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); + // get groudtruth by disbale ir + + paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); + SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, + &truth_output_data, 1); + + // get infer results from multi threads + std::vector threads; + config.SetExecStream(streams[0]); + config.pass_builder()->DeletePass("add_support_int8_pass"); + auto main_predictor = CreatePredictor(config); + std::vector predictors; + for (size_t i = 0; i < thread_num - 1; ++i) { + predictors.push_back(std::move(main_predictor->Clone(streams[i + 1]))); + LOG(INFO) << "predictors[" << i << "] stream is " + << predictors[i]->GetExecStream(); + } + predictors.push_back(std::move(main_predictor)); + LOG(INFO) << "predictors[" << thread_num - 1 << "] stream is " + << predictors[thread_num - 1]->GetExecStream(); + for (int i = 0; i < thread_num; ++i) { + threads.emplace_back(paddle::test::SingleThreadPrediction, + predictors[i].get(), &my_input_data_map, + &infer_output_data, 10); + } + + // thread join & check outputs + for (int i = 0; i < thread_num; ++i) { + LOG(INFO) << "join tid : " << i; + threads[i].join(); + // CompareRecord(&truth_output_data, &infer_output_data); + } + + std::cout << "finish multi-thread test" << std::endl; +} +#endif + } // namespace paddle_infer int main(int argc, char** argv) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index fd61b813f0aa2..d990aab57736d 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -129,11 +129,22 @@ DeviceType Place2DeviceType(const platform::Place& place) { } DeviceContextPool* DeviceContextPool::pool = nullptr; +thread_local const std::map>>* + DeviceContextPool::external_device_contexts_ = nullptr; platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { VLOG(6) << "DeviceContextPool Get: " << place; - auto it = device_contexts_.find(place); - if (it == device_contexts_.end()) { + const std::map>>* + ptr; + if (external_device_contexts_ && external_device_contexts_->count(place)) { + ptr = external_device_contexts_; + } else { + ptr = &device_contexts_; + } + + auto it = ptr->find(place); + if (it == ptr->end()) { PADDLE_THROW(platform::errors::Unimplemented( "Place %s is not supported. 
Please check that your paddle compiles " "with WITH_GPU, WITH_XPU, WITH_IPU, WITH_MLU or WITH_ASCEND_CL option " @@ -145,6 +156,27 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { return it->second.get().get(); } +size_t DeviceContextPool::size() const { + if (external_device_contexts_) { + return external_device_contexts_->size(); + } + return device_contexts_.size(); +} + +const std::map>>& +DeviceContextPool::device_contexts() const { + if (external_device_contexts_) { + return *external_device_contexts_; + } + return device_contexts_; +} + +void DeviceContextPool::SetDeviceContexts( + const std::map>>* + dev_ctxs) { + external_device_contexts_ = dev_ctxs; +} + template inline void EmplaceDeviceContext( std::map>>* diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index d0dae706ba572..1855f43f9d6cf 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -57,7 +57,7 @@ limitations under the License. */ #endif #ifdef PADDLE_WITH_MKLDNN -#include "dnnl.hpp" +#include "dnnl.hpp" // NOLINT #include "paddle/fluid/framework/data_layout.h" #endif @@ -915,17 +915,22 @@ class DeviceContextPool { const typename DefaultDeviceContextType::TYPE*>(Get(place)); } - size_t size() const { return device_contexts_.size(); } + size_t size() const; const std::map>>& - device_contexts() const { - return device_contexts_; - } + device_contexts() const; + + static void SetDeviceContexts( + const std::map>>*); private: static DeviceContextPool* pool; std::map>> device_contexts_; + static thread_local const std::map< + Place, std::shared_future>>* + external_device_contexts_; // not owned DISABLE_COPY_AND_ASSIGN(DeviceContextPool); };
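
Usage sketch for the external-stream APIs added in PATCH 53/53. This is an illustration only, assembled from the calls the patch itself introduces (Config::SetExecStream, Config::external_stream_enabled, Predictor::Clone(stream), Predictor::GetExecStream) plus standard CUDA runtime calls; the model path, include path, and input/output handling are placeholders and not part of the patch.

// Minimal sketch (assumptions: CUDA build, "./model" is a placeholder path,
// the include path depends on how the inference library is installed).
#include <cassert>
#include <cuda_runtime.h>
#include "paddle_inference_api.h"

int main() {
  cudaStream_t stream1, stream2;
  cudaStreamCreate(&stream1);
  cudaStreamCreate(&stream2);

  paddle_infer::Config config;
  config.SetModel("./model/inference.pdmodel", "./model/inference.pdiparams");
  config.EnableUseGpu(100, 0);
  // Bind the predictor to a user-owned stream instead of the global one.
  config.SetExecStream(stream1);
  assert(config.external_stream_enabled());

  auto predictor = paddle_infer::CreatePredictor(config);
  // A config carrying an external stream requires Clone() to receive one too;
  // the clone then runs on its own stream with its own GPU resources.
  auto predictor2 = predictor->Clone(stream2);

  // Each predictor reports the stream it was bound to.
  assert(predictor->GetExecStream() == stream1);
  assert(predictor2->GetExecStream() == stream2);

  // ... feed inputs via GetInputHandle()/CopyFromCpu(), then run each
  // predictor, e.g. from its own thread:
  predictor->Run();
  predictor2->Run();

  cudaStreamSynchronize(stream1);
  cudaStreamSynchronize(stream2);
  return 0;
}

As enforced in AnalysisPredictor::Clone above, cloning without a stream from a config configured with SetExecStream throws, and vice versa; per-stream cuBLAS/cuDNN handles and device contexts are managed through ResourceManager keyed by the stream pointer.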