From 29a6b8c91cd1173423cc763db2b9961049cc3d16 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 11 May 2022 10:30:39 +0800 Subject: [PATCH 01/49] update CompilationProgressLogger (#42665) --- .../fluid/platform/device/ipu/ipu_strategy.cc | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index aff5498243000..20214428fab36 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -32,6 +32,20 @@ void RegisterGetter( options_type[name] = type_str; } +struct DefaultCompilationProgressLogger { + void operator()(int progress, int total) { + if (progress != progress_ && progress % log_interval_ == 0) { + progress_ = progress; + VLOG(1) << "Graph compile progress: " << progress << "%"; + } + } + + int log_interval_ = 10; + int progress_ = 0; + // default total progress + int total_ = 100; +}; + } // namespace namespace paddle { @@ -417,11 +431,7 @@ IpuStrategy::IpuStrategy() { // Default options // Can also be set as a custom logger in python, like using tqdm - popart_options.compilationProgressLogger = [](int progress, int total) { - if (progress % 10 == 0) { - VLOG(1) << "compile progress: " << progress << "%"; - } - }; + popart_options.compilationProgressLogger = DefaultCompilationProgressLogger(); } void IpuStrategy::AddBoolOption(const std::string& option, bool value) { From a1abb7c9b9bc214c39d6967a0e0f2bba5da9a1fe Mon Sep 17 00:00:00 2001 From: wenbin Date: Wed, 11 May 2022 10:38:11 +0800 Subject: [PATCH 02/49] swish refactor (#42610) * swish refactor * bug fix * trt7 non-linear bug fix --- .../inference/tensorrt/convert/swish_op.cc | 2 +- .../tensorrt/plugin/swish_op_plugin.cu | 43 +++++++++++++++---- .../tensorrt/plugin/swish_op_plugin.h | 25 +++++++++-- 3 files changed, 56 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index b2e394d14eba2..0df5c013d34d4 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -75,7 +75,7 @@ class SwishOpConverter : public OpConverter { bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); plugin::SwishPlugin* plugin = new plugin::SwishPlugin(beta, with_fp16); - layer = engine_->AddPlugin(&input, input_num, plugin); + layer = engine_->AddPluginV2Ext(&input, input_num, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu index 9720719fd0bca..2c2fad74b9a2d 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.cu @@ -24,6 +24,16 @@ namespace tensorrt { namespace plugin { int SwishPlugin::initialize() TRT_NOEXCEPT { return 0; } +void SwishPlugin::terminate() TRT_NOEXCEPT {} + +bool SwishPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { + if (with_fp16_) { + return type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF; + } + return type == nvinfer1::DataType::kFLOAT; +} nvinfer1::Dims SwishPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, @@ -85,17 +95,29 @@ int SwishPlugin::enqueue(int batch_size, const void *const *inputs, void *const *outputs, void *workspace, 
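                          // The rewritten body below dispatches on
                          // getDataType(): the kFLOAT branch keeps the
                          // original float kernel, the kHALF branch launches
                          // swish_kernel with beta_ cast to half, and any
                          // other input type raises InvalidArgument.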
cudaStream_t stream) TRT_NOEXCEPT { #endif - // input dims is CHW. const auto &input_dims = this->getInputDims(0); - const float *input = reinterpret_cast(inputs[0]); - float *output = reinterpret_cast(outputs)[0]; int num = batch_size; for (int i = 0; i < input_dims.nbDims; i++) { num *= input_dims.d[i]; } int threads = 1024; int blocks = (num + threads - 1) / threads; - swish_kernel<<>>(num, input, output, beta_); + auto type = getDataType(); + if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp32"; + const float *input = reinterpret_cast(inputs[0]); + float *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, beta_); + } else if (type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. Swish-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *output = reinterpret_cast(outputs)[0]; + swish_kernel<<>>(num, input, output, + (half)beta_); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Swish TRT Plugin's input type should be float or half.")); + } return cudaGetLastError() != cudaSuccess; } @@ -140,12 +162,15 @@ bool SwishPluginDynamic::supportsFormatCombination( const nvinfer1::PluginTensorDesc &in = in_out[pos]; if (pos == 0) { if (with_fp16_) { - return (in.type == nvinfer1::DataType::kFLOAT || - in.type == nvinfer1::DataType::kHALF) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + bool res = (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF); +// encounter trt crash bug +#if IS_TRT_VERSION_LT(8000) + res = res && (in.format == nvinfer1::TensorFormat::kLINEAR); +#endif + return res; } else { - return (in.type == nvinfer1::DataType::kFLOAT) && - (in.format == nvinfer1::TensorFormat::kLINEAR); + return in.type == nvinfer1::DataType::kFLOAT; } } const nvinfer1::PluginTensorDesc &prev = in_out[pos - 1]; diff --git a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h index c4bdc5f921509..aa8fdce23fa89 100644 --- a/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/swish_op_plugin.h @@ -26,7 +26,7 @@ namespace inference { namespace tensorrt { namespace plugin { -class SwishPlugin : public PluginTensorRT { +class SwishPlugin : public PluginTensorRTV2Ext { private: float beta_; @@ -55,13 +55,24 @@ class SwishPlugin : public PluginTensorRT { int initialize() TRT_NOEXCEPT override; - SwishPlugin* clone() const TRT_NOEXCEPT override { - return new SwishPlugin(beta_, with_fp16_); + nvinfer1::IPluginV2Ext* clone() const TRT_NOEXCEPT override { + auto* plugin = new SwishPlugin(beta_, with_fp16_); + plugin->data_format_ = data_format_; + plugin->data_type_ = data_type_; + plugin->input_dims_ = input_dims_; + return plugin; } const char* getPluginType() const TRT_NOEXCEPT override { return "swish_plugin"; } + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override { + return input_types[0]; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, int nbInputDims) TRT_NOEXCEPT override; @@ -71,6 +82,12 @@ class SwishPlugin : public PluginTensorRT { int enqueue(int batchSize, const void* const* inputs, void* const* outputs, #endif void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + void terminate() TRT_NOEXCEPT override; + void destroy() TRT_NOEXCEPT 
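      // TensorRT calls destroy() when the network, builder or engine that
      // owns this plugin is released; the plugin is expected to free itself.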
override { delete this; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "2"; } + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; }; class SwishPluginCreator : public TensorRTPluginCreator { @@ -79,7 +96,7 @@ class SwishPluginCreator : public TensorRTPluginCreator { return "swish_plugin"; } - const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + const char* getPluginVersion() const TRT_NOEXCEPT override { return "2"; } nvinfer1::IPluginV2* deserializePlugin( const char* name, const void* serial_data, From 7b828f718a4a5d91ba8d1716e73f5c486065e7b3 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Wed, 11 May 2022 11:00:09 +0800 Subject: [PATCH 03/49] remove old XDNN implementation test=kunlun (#42404) --- cmake/external/xpu.cmake | 4 +- paddle/fluid/framework/data_type_transform.cc | 77 ++++++- paddle/fluid/operators/log_loss_op_xpu.cc | 93 ++++---- .../operators/metrics/accuracy_op_xpu.cc | 80 ++----- .../fluid/operators/optimizers/lamb_op_xpu.cc | 182 ++++++++------- .../operators/optimizers/rmsprop_op_xpu.cc | 215 +++++++++--------- .../fluid/operators/optimizers/sgd_op_xpu.cc | 59 +++-- .../fluid/platform/device/xpu/xpu1_op_list.h | 5 - .../fluid/platform/device/xpu/xpu2_op_list.h | 2 + .../unittests/xpu/test_accuracy_op_xpu.py | 73 +++--- .../tests/unittests/xpu/test_sgd_op_xpu.py | 49 ++-- 11 files changed, 459 insertions(+), 380 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index d5ccf1297922f..2c7f28b3a5223 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,7 +9,7 @@ SET(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220510") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -17,7 +17,7 @@ endif() # ubuntu and centos: use output by XDNN API team if(NOT DEFINED XPU_XDNN_BASE_URL) SET(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220425") + SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220510") else() SET(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 14b5662b24aeb..c4ea6a3c6bc66 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -18,6 +18,10 @@ limitations under the License. 
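// The XPU hunk below routes TransDataType() through xpu::cast_v2: a switch on
// the source data type selects XPUTransDataType<InType>, whose XPUCastCallback
// macro expands over the candidate destination types, so each supported
// (src, dst) pair lowers to a single cast_v2 launch on the XPU context.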
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/platform/transform.h" +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/device_wrapper.h" +#endif + namespace paddle { namespace framework { @@ -28,6 +32,49 @@ struct CastDataTypeFunctor { } }; +#if defined(PADDLE_WITH_XPU) + +template +static void XPUCastData(const framework::Tensor& in, framework::Tensor* out, + const platform::XPUDeviceContext* dev_ctx) { + using XPUInTDType = typename XPUTypeTrait::Type; + using XPUOutTDType = typename XPUTypeTrait::Type; + int r = xpu::cast_v2( + dev_ctx->x_context(), + reinterpret_cast(in.data()), + reinterpret_cast(out->mutable_data(in.place())), + in.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + dev_ctx->Wait(); +} + +template +static void XPUTransDataType( + const framework::Tensor& in, framework::Tensor* out, + const paddle::framework::proto::VarType::Type& dst_type, + const platform::DeviceContext* ctx) { + auto* context = static_cast(ctx); + +#define XPUCastCallback(cpp_type, proto_type) \ + do { \ + if (dst_type == proto_type) { \ + XPUCastData(in, out, context); \ + } \ + } while (0) + + if (dst_type == proto::VarType::FP32 && dst_type == proto::VarType::FP16 && + dst_type == proto::VarType::BOOL && dst_type == proto::VarType::INT16 && + dst_type == proto::VarType::INT32 && dst_type == proto::VarType::INT64) { + _ForEachDataType_(XPUCastCallback); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(dst_type))); + } +} + +#endif + template struct CastDataType { CastDataType(const framework::Tensor& in, framework::Tensor* out, @@ -88,6 +135,34 @@ void TransDataType(const Tensor& in, auto dst_type = type; auto ctx = pool.Get(in.place()); +#if defined(PADDLE_WITH_XPU) + switch (src_type) { + case proto::VarType::FP16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::FP32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::BOOL: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT16: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT32: + XPUTransDataType(in, out, dst_type, ctx); + break; + case proto::VarType::INT64: + XPUTransDataType(in, out, dst_type, ctx); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported in XPU when casting data type.", + DataTypeToString(src_type))); + } + +#else + switch (src_type) { case proto::VarType::FP16: framework::VisitDataType(dst_type, @@ -123,6 +198,7 @@ void TransDataType(const Tensor& in, "Data type (%s) is not supported when casting data type.", DataTypeToString(src_type))); } +#endif } void TransComplexToReal(const proto::VarType::Type& dst_type, @@ -131,7 +207,6 @@ void TransComplexToReal(const proto::VarType::Type& dst_type, auto& pool = platform::DeviceContextPool::Instance(); auto* ctx = pool.Get(in.place()); out->Resize(in.dims()); - // complex -> real switch (src_type) { case proto::VarType::COMPLEX64: diff --git a/paddle/fluid/operators/log_loss_op_xpu.cc b/paddle/fluid/operators/log_loss_op_xpu.cc index aa5fdd86745d6..ead6f94417b6e 100644 --- a/paddle/fluid/operators/log_loss_op_xpu.cc +++ b/paddle/fluid/operators/log_loss_op_xpu.cc @@ -21,58 +21,67 @@ template class LogLossXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* 
labels = ctx.Input("Labels"); - auto* loss = ctx.Output("Loss"); - auto epsilon = static_cast(ctx.Attr("epsilon")); - loss->mutable_data(ctx.GetPlace()); - int n = predict->numel(); - auto& dev_ctx = ctx.template device_context(); - int r = - xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, predict->data(), - labels->data(), loss->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + /*** TODO wait XDNN new interface + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* loss = ctx.Output("Loss"); + auto epsilon = static_cast(ctx.Attr("epsilon")); + loss->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = + xpu::log_loss_fwd(dev_ctx.x_context(), n, epsilon, + predict->data(), + labels->data(), loss->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check + whether " + "Baidu Kunlun Card is properly installed.", + r)); + ***/ } }; template class LogLossGradXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* predict = ctx.Input("Predicted"); - auto* labels = ctx.Input("Labels"); - auto* dloss = ctx.Input(framework::GradVarName("Loss")); - auto* dpred = ctx.Output(framework::GradVarName("Predicted")); - if (!dpred) { - return; - } - auto epsilon = static_cast(ctx.Attr("epsilon")); - dpred->mutable_data(ctx.GetPlace()); - int n = predict->numel(); - auto& dev_ctx = ctx.template device_context(); - int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, - predict->data(), labels->data(), - dloss->data(), dpred->data()); - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::External( - "XPU log_loss kernel return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + /*** TODO wait XDNN new interface + + auto* predict = ctx.Input("Predicted"); + auto* labels = ctx.Input("Labels"); + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + if (!dpred) { + return; + } + auto epsilon = static_cast(ctx.Attr("epsilon")); + dpred->mutable_data(ctx.GetPlace()); + int n = predict->numel(); + auto& dev_ctx = ctx.template device_context(); + int r = xpu::log_loss_bwd(dev_ctx.x_context(), n, epsilon, + predict->data(), labels->data(), + dloss->data(), dpred->data()); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External( + "XPU log_loss kernel return wrong value[%d], please check + whether " + "Baidu Kunlun Card is properly installed.", + r)); + ***/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - log_loss, ops::LogLossXPUKernel); -REGISTER_OP_XPU_KERNEL( - log_loss_grad, - ops::LogLossGradXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// log_loss, ops::LogLossXPUKernel); +// REGISTER_OP_XPU_KERNEL( +// log_loss_grad, +// ops::LogLossGradXPUKernel); #endif diff --git a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc index 3cc1be4de8a82..82e4b90468a38 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_xpu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_xpu.cc @@ -16,7 +16,7 @@ 
limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/platform/device/xpu/xpu_header.h" +#include "paddle/fluid/platform/device/device_wrapper.h" namespace paddle { namespace operators { @@ -42,68 +42,26 @@ class AccuracyXPUKernel : public framework::OpKernel { if (num_samples == 0) { return; } - size_t indices_int32_size = num_samples * class_dim * sizeof(int); - size_t indices_int64_size = num_samples * class_dim * sizeof(int64_t); - size_t label_int32_size = num_samples * sizeof(int); - size_t label_int64_size = num_samples * sizeof(int64_t); auto& dev_ctx = ctx.template device_context(); - int* indices_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&indices_int32_device), - indices_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(indices_int32_size))); - int* label_int32_device = NULL; - PADDLE_ENFORCE_EQ( - xpu_malloc(reinterpret_cast(&label_int32_device), - label_int32_size), - XPU_SUCCESS, - platform::errors::ResourceExhausted( - "\n\nOut of memory error on XPU, Cannot allocate %s memory" - " on XPU. \n\nPlease check whether there is any other process " - "using XPU.\n", - string::HumanReadableSize(label_int32_size))); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + int size = num_samples * class_dim; + int* indices_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(indices_int32_ptr); + int* label_int32_ptr = RAII_GUARD.alloc_l3_or_gm(size); + PADDLE_ENFORCE_XDNN_NOT_NULL(label_int32_ptr); - int* indices_int32_host = - reinterpret_cast(std::malloc(indices_int32_size)); - int64_t* indices_int64_host = - reinterpret_cast(std::malloc(indices_int64_size)); - int* label_int32_host = - reinterpret_cast(std::malloc(label_int32_size)); - int64_t* label_int64_host = - reinterpret_cast(std::malloc(label_int64_size)); - dev_ctx.Wait(); - memory::Copy(platform::CPUPlace(), indices_int64_host, ctx.GetPlace(), - indices_data, indices_int64_size); - memory::Copy(platform::CPUPlace(), label_int64_host, ctx.GetPlace(), - label_data, label_int64_size); - for (size_t i = 0; i < num_samples; ++i) { - label_int32_host[i] = label_int64_host[i]; - for (size_t j = 0; j < class_dim; ++j) { - indices_int32_host[i * class_dim + j] = - indices_int64_host[i * class_dim + j]; - } - } - memory::Copy(ctx.GetPlace(), indices_int32_device, platform::CPUPlace(), - indices_int32_host, indices_int32_size); - memory::Copy(ctx.GetPlace(), label_int32_device, platform::CPUPlace(), - label_int32_host, label_int32_size); - int r = xpu::accuracy(dev_ctx.x_context(), indices_int32_device, - label_int32_device, num_samples, class_dim, - correct_data, total_data, accuracy_data); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Fatal("XPU accuracy kernel error!")); - dev_ctx.Wait(); - xpu_free(indices_int32_device); - xpu_free(label_int32_device); - std::free(indices_int32_host); - std::free(indices_int64_host); - std::free(label_int32_host); - std::free(label_int64_host); + int r = xpu::cast_v2(dev_ctx.x_context(), indices_data, + indices_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::cast_v2(dev_ctx.x_context(), label_data, + label_int32_ptr, size); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); + + r = xpu::accuracy(dev_ctx.x_context(), 
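+                      // indices_data and label_data were cast from int64 to
+                      // int32 into the scratch buffers above; xpu::accuracy
+                      // consumes those int32 copies.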
indices_int32_ptr, label_int32_ptr, + num_samples, class_dim, correct_data, total_data, + accuracy_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast_v2"); } }; diff --git a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc index e7cbe4aa8dd4b..643f70b260206 100644 --- a/paddle/fluid/operators/optimizers/lamb_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/lamb_op_xpu.cc @@ -25,101 +25,111 @@ template class LambOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); + /*** TODO wait XDNN new interface + using paddle::framework::LoDTensor; + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); - using paddle::framework::LoDTensor; + using paddle::framework::LoDTensor; - // inputs - T epsilon = static_cast(ctx.Attr("epsilon")); - T weight_decay = static_cast(ctx.Attr("weight_decay")); - T beta1 = static_cast(ctx.Attr("beta1")); - T beta2 = static_cast(ctx.Attr("beta2")); - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", - "Param", "Lamb"); - auto* grad_var = ctx.InputVar("Grad"); - auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", - "Moment1", "Lamb"); - auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", - "Moment2", "Lamb"); - auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), "Input", - "LearningRate", "Lamb"); + // inputs + T epsilon = static_cast(ctx.Attr("epsilon")); + T weight_decay = static_cast(ctx.Attr("weight_decay")); + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Lamb"); + auto* grad_var = ctx.InputVar("Grad"); + auto& mom1 = GET_DATA_SAFELY(ctx.Input("Moment1"), "Input", + "Moment1", "Lamb"); + auto& mom2 = GET_DATA_SAFELY(ctx.Input("Moment2"), "Input", + "Moment2", "Lamb"); + auto& lr = GET_DATA_SAFELY(ctx.Input("LearningRate"), + "Input", + "LearningRate", "Lamb"); - auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), "Input", - "Beta1Pow", "Lamb"); - auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), "Input", - "Beta2Pow", "Lamb"); + auto& beta1_pow = GET_DATA_SAFELY(ctx.Input("Beta1Pow"), + "Input", + "Beta1Pow", "Lamb"); + auto& beta2_pow = GET_DATA_SAFELY(ctx.Input("Beta2Pow"), + "Input", + "Beta2Pow", "Lamb"); - auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), - "Output", "ParamOut", "Lamb"); - auto& mom1_out = GET_DATA_SAFELY(ctx.Output("Moment1Out"), - "Output", "Moment1Out", "Lamb"); - auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), - "Output", "Moment2Out", "Lamb"); - auto& beta1_pow_out = GET_DATA_SAFELY(ctx.Output("Beta1PowOut"), - "Output", "Beta1PowOut", "Lamb"); - auto& beta2_pow_out = GET_DATA_SAFELY(ctx.Output("Beta2PowOut"), - "Output", "Beta2PowOut", "Lamb"); - auto& dev_ctx = ctx.template device_context(); + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Lamb"); + auto& mom1_out = 
GET_DATA_SAFELY(ctx.Output("Moment1Out"), + "Output", "Moment1Out", "Lamb"); + auto& mom2_out = GET_DATA_SAFELY(ctx.Output("Moment2Out"), + "Output", "Moment2Out", "Lamb"); + auto& beta1_pow_out = + GET_DATA_SAFELY(ctx.Output("Beta1PowOut"), + "Output", "Beta1PowOut", "Lamb"); + auto& beta2_pow_out = + GET_DATA_SAFELY(ctx.Output("Beta2PowOut"), + "Output", "Beta2PowOut", "Lamb"); + auto& dev_ctx = ctx.template device_context(); - if (grad_var->IsType()) { - auto& grad = *ctx.Input("Grad"); - int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), - mom1.template data(), mom2.template data(), - param.template data(), beta1_pow.template data(), - beta2_pow.template data(), beta1, beta2, epsilon, - weight_decay, lr.template data(), - mom1_out.template mutable_data(ctx.GetPlace()), - mom2_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - beta1_pow_out.template mutable_data(ctx.GetPlace()), - beta2_pow_out.template mutable_data(ctx.GetPlace()), - param.numel()); + if (grad_var->IsType()) { + auto& grad = *ctx.Input("Grad"); + int r = xpu::lamb(dev_ctx.x_context(), grad.template data(), + mom1.template data(), mom2.template data(), + param.template data(), beta1_pow.template + data(), + beta2_pow.template data(), beta1, beta2, epsilon, + weight_decay, lr.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + beta1_pow_out.template + mutable_data(ctx.GetPlace()), + beta2_pow_out.template + mutable_data(ctx.GetPlace()), + param.numel()); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of LambOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of LambOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of LambOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Variable type not supported by lamb_op. 
Expect LoDTensor, " - "but got %s", - framework::ToTypeName(param_var->Type()))); - } + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of LambOp, error message: INVALID_PARAM, " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of LambOp, error message: " + "RUNTIME_ERROR, please check whether Baidu " + "Kunlun Card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of LambOp, error " + "message: NO_ENOUGH_WORKSPACE, XPU " + "has no enough memory.")); + } else { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of LambOp, error " + "message: OTHER " + "XPU API returns error code: %d.", + r)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Variable type not supported by lamb_op. Expect LoDTensor, " + "but got %s", + framework::ToTypeName(param_var->Type()))); + } + **/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - lamb, ops::LambOpXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// lamb, ops::LambOpXPUKernel); #endif diff --git a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc index 85c2d42c841f0..873056c7f67fe 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/rmsprop_op_xpu.cc @@ -40,113 +40,122 @@ template class RmspropOpXPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - using paddle::framework::LoDTensor; - - // check Param & Grad tensor type - const auto* param_var = ctx.InputVar("Param"); - PADDLE_ENFORCE_EQ(param_var->IsType(), true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Param").front(), - framework::ToTypeName(param_var->Type()))); - - const auto* grad_var = ctx.InputVar("Grad"); - PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "Tensor holds the wrong type,Expected Var(%s)'s " - "type is LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - - // inputs - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", - "Param", "Rmsprop"); - auto& meanSquare = GET_DATA_SAFELY(ctx.Input("MeanSquare"), - "Input", "MeanSquare", "Rmsprop"); - auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", - "Rmsprop"); - auto& mom = GET_DATA_SAFELY(ctx.Input("Moment"), "Input", - "Moment", "Rmsprop"); - - auto* learning_rate = ctx.Input("LearningRate"); - PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1, - platform::errors::InvalidArgument( - "learining rate should have dimension = 1." 
- " But received learning rate dim [%s] ", - learning_rate->dims().size())); - T lr = static_cast(GetAttrFromTensor(learning_rate)); - - // constants - T epsilon = static_cast(ctx.Attr("epsilon")); - T decay = static_cast(ctx.Attr("decay")); - T momentum = static_cast(ctx.Attr("momentum")); - - // outputs - auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), - "Output", "ParamOut", "Rmsprop"); - auto& mom_out = GET_DATA_SAFELY(ctx.Output("MomentOut"), - "Output", "MomentOut", "Rmsprop"); - auto& mom_sqrt_out = GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), - "Output", "MeanSquareOut", "Rmsprop"); - auto& dev_ctx = ctx.template device_context(); - - ///// rmsprop优化算法 - /// - /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); - /// - /// mom_out[i] = momentum * mom[i] + lr * - /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); - /// - /// p_out[i] = p[i] - mom_out[i]; - /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, - /// const float* ms, const float* g, const float* mom, - /// float epsilon, float rho, float momentum, float lr, - /// float *ms_out, float *mom_out, float *p_out, int n) - int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), - meanSquare.template data(), grad.template data(), - mom.template data(), epsilon, decay, momentum, lr, - mom_sqrt_out.template mutable_data(ctx.GetPlace()), - mom_out.template mutable_data(ctx.GetPlace()), - param_out.template mutable_data(ctx.GetPlace()), - param.numel()); - - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of RmspropOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of RmspropOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } else { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of RmspropOp, error " - "message: OTHER " - "XPU API returns error code: %d.", - r)); - } + /*** TODO wait XDNN new interface + using paddle::framework::LoDTensor; + + // check Param & Grad tensor type + const auto* param_var = ctx.InputVar("Param"); + PADDLE_ENFORCE_EQ(param_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Param").front(), + framework::ToTypeName(param_var->Type()))); + + const auto* grad_var = ctx.InputVar("Grad"); + PADDLE_ENFORCE_EQ(grad_var->IsType(), true, + platform::errors::InvalidArgument( + "Tensor holds the wrong type,Expected Var(%s)'s " + "type is LoDTensor, " + "but the received is %s", + ctx.InputNames("Grad").front(), + framework::ToTypeName(grad_var->Type()))); + + // inputs + auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", + "Param", "Rmsprop"); + auto& meanSquare = GET_DATA_SAFELY(ctx.Input("MeanSquare"), + "Input", "MeanSquare", "Rmsprop"); + auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", + "Grad", + "Rmsprop"); + auto& mom = GET_DATA_SAFELY(ctx.Input("Moment"), "Input", + "Moment", "Rmsprop"); + + auto* 
learning_rate = ctx.Input("LearningRate"); + PADDLE_ENFORCE_EQ(learning_rate->dims().size(), 1, + platform::errors::InvalidArgument( + "learining rate should have dimension = 1." + " But received learning rate dim [%s] ", + learning_rate->dims().size())); + T lr = static_cast(GetAttrFromTensor(learning_rate)); + + // constants + T epsilon = static_cast(ctx.Attr("epsilon")); + T decay = static_cast(ctx.Attr("decay")); + T momentum = static_cast(ctx.Attr("momentum")); + + // outputs + auto& param_out = GET_DATA_SAFELY(ctx.Output("ParamOut"), + "Output", "ParamOut", "Rmsprop"); + auto& mom_out = GET_DATA_SAFELY(ctx.Output("MomentOut"), + "Output", "MomentOut", "Rmsprop"); + auto& mom_sqrt_out = + GET_DATA_SAFELY(ctx.Output("MeanSquareOut"), + "Output", "MeanSquareOut", + "Rmsprop"); + auto& dev_ctx = ctx.template device_context(); + + ///// rmsprop优化算法 + /// + /// ms_out[i] = rho * ms[i] + (1 - rho) * (g[i] * g[i]); + /// + /// mom_out[i] = momentum * mom[i] + lr * + /// (g[i] / ((float)sqrt(ms_out[i] + epsilon))); + /// + /// p_out[i] = p[i] - mom_out[i]; + /// DLL_EXPORT int rmsprop(Context* ctx, const float* p, + /// const float* ms, const float* g, const float* mom, + /// float epsilon, float rho, float momentum, float lr, + /// float *ms_out, float *mom_out, float *p_out, int n) + int r = xpu::rmsprop(dev_ctx.x_context(), param.template data(), + meanSquare.template data(), grad.template + data(), + mom.template data(), epsilon, decay, momentum, + lr, + mom_sqrt_out.template + mutable_data(ctx.GetPlace()), + mom_out.template mutable_data(ctx.GetPlace()), + param_out.template mutable_data(ctx.GetPlace()), + param.numel()); + + if (r == xpu::Error_t::INVALID_PARAM) { + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::InvalidArgument( + "XPU kernel error of RmspropOp, error message: INVALID_PARAM, + " + "please check your input & output.")); + } else if (r == xpu::Error_t::RUNTIME_ERROR) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::Unavailable( + "XPU kernel error of RmspropOp, error message: " + "RUNTIME_ERROR, please check whether Baidu " + "Kunlun Card is properly installed.")); + } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of RmspropOp, error " + "message: NO_ENOUGH_WORKSPACE, XPU " + "has no enough memory.")); + } else { + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::ResourceExhausted( + "XPU kernel error of RmspropOp, error " + "message: OTHER " + "XPU API returns error code: %d.", + r)); + } + ***/ } }; } // namespace operators } // namespace paddle -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - rmsprop, - ops::RmspropOpXPUKernel); +// namespace ops = paddle::operators; +// REGISTER_OP_XPU_KERNEL( +// rmsprop, +// ops::RmspropOpXPUKernel); #endif diff --git a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc index 9dabca1b66a77..e7c03be95cae1 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_xpu.cc @@ -14,11 +14,15 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/optimizers/sgd_op.h" #include +#include "paddle/fluid/platform/device/device_wrapper.h" + namespace paddle { namespace operators { template class SGDOpXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext &ctx) const override { const auto *learning_rate = ctx.Input("LearningRate"); @@ -48,40 +52,31 @@ class SGDOpXPUKernel : public framework::OpKernel { "numel = [%s], ParamOut's numel = [%s]", grad->numel(), sz)); - const T *lr = learning_rate->data(); + const T *lr_t = learning_rate->data(); + auto &dev_ctx = ctx.template device_context(); + xpu::ctx_guard RAII_GUARD(dev_ctx.x_context()); + const float *lr = nullptr; + if (std::is_same::value) { + float *lr_float = + RAII_GUARD.alloc_l3_or_gm(learning_rate->numel()); + int r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(lr_t), + lr_float, learning_rate->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "clip_v2"); + lr = lr_float; + } else { + lr = reinterpret_cast(lr_t); + } + const T *param_data = param->data(); const T *grad_data = grad->data(); T *out_data = param_out->mutable_data(ctx.GetPlace()); - auto &dev_ctx = ctx.template device_context(); - int r = xpu::sgd(dev_ctx.x_context(), sz, grad_data, param_data, lr, - out_data); - if (r == xpu::Error_t::INVALID_PARAM) { - PADDLE_ENFORCE_EQ( - r, xpu::Error_t::SUCCESS, - platform::errors::InvalidArgument( - "XPU kernel error of SgdOp, error message: INVALID_PARAM, " - "please check your input & output.")); - } else if (r == xpu::Error_t::RUNTIME_ERROR) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::Unavailable( - "XPU kernel error of SgdOp, error message: " - "RUNTIME_ERROR, please check whether Baidu " - "Kunlun Card is properly installed.")); - } else if (r == xpu::Error_t::NO_ENOUGH_WORKSPACE) { - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::ResourceExhausted( - "XPU kernel error of SgdOp, error " - "message: NO_ENOUGH_WORKSPACE, XPU " - "has no enough memory.")); - } - } else { - PADDLE_ENFORCE_EQ(false, true, - platform::errors::PermissionDenied( - "Unsupported Variable Type of Param & Grad in " - "SgdOp-XPU. 
Excepted " - "LodTensor, But received [%s] and [%s]", - paddle::framework::ToTypeName(param_var->Type()))); + int r = xpu::sgd(dev_ctx.x_context(), + reinterpret_cast(grad_data), + reinterpret_cast(param_data), lr, + reinterpret_cast(out_data), sz); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "sgd"); } } }; @@ -90,6 +85,8 @@ class SGDOpXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; REGISTER_OP_XPU_KERNEL( - sgd, ops::SGDOpXPUKernel); + sgd, ops::SGDOpXPUKernel, + ops::SGDOpXPUKernel); #endif diff --git a/paddle/fluid/platform/device/xpu/xpu1_op_list.h b/paddle/fluid/platform/device/xpu/xpu1_op_list.h index a76bdd4ae9679..e8c3eee5b538b 100644 --- a/paddle/fluid/platform/device/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu1_op_list.h @@ -145,7 +145,6 @@ XPUOpMap& get_kl1_ops() { {"hard_switch", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"iou_similarity", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -175,9 +174,6 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"logsumexp", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lookup_table_v2_grad", @@ -236,7 +232,6 @@ XPUOpMap& get_kl1_ops() { pOpKernelType(vartype::INT32, XPUPlace()), pOpKernelType(vartype::BOOL, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, - {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 0dcab845bc9ca..99f8e5ace9c00 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -328,6 +328,8 @@ XPUOpMap& get_kl2_ops() { pOpKernelType(vartype::INT64, XPUPlace())})}, {"scatter", XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()), pOpKernelType(vartype::FP32, XPUPlace())})}, + {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sigmoid_cross_entropy_with_logits", diff --git a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py index 7aaa78856811f..b0bb9a37c16bd 100755 --- a/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_accuracy_op_xpu.py @@ -23,41 +23,52 @@ from paddle.fluid import compiler, Program, program_guard import paddle +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + paddle.enable_static() -@unittest.skipIf(not paddle.is_compiled_with_xpu(), - "core is not compiled with XPU") -class 
TestXPUAccuracyOp(OpTest): - def setUp(self): - self.op_type = "accuracy" - self.init_dtype() - n = 8192 - infer = np.random.random((n, 1)).astype(self.dtype) - indices = np.random.randint(0, 2, (n, 1)).astype('int64') - label = np.random.randint(0, 2, (n, 1)).astype('int64') - self.inputs = {'Out': infer, 'Indices': indices, "Label": label} - num_correct = 0 - for rowid in range(n): - for ele in indices[rowid]: - if ele == label[rowid]: - num_correct += 1 - break - self.outputs = { - 'Accuracy': np.array([num_correct / float(n)]).astype(self.dtype), - 'Correct': np.array([num_correct]).astype("int32"), - 'Total': np.array([n]).astype("int32") - } - self.attrs = {'use_xpu': True} - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - if paddle.is_compiled_with_xpu(): - place = paddle.XPUPlace(0) - self.check_output_with_place(place) +class XPUTestAccuracyOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'accuracy' + self.use_dynamic_create_class = False + + class TestXPUAccuracyOp(XPUOpTest): + def setUp(self): + self.op_type = "accuracy" + self.init_dtype() + n = 8192 + infer = np.random.random((n, 1)).astype(self.dtype) + indices = np.random.randint(0, 2, (n, 1)).astype('int64') + label = np.random.randint(0, 2, (n, 1)).astype('int64') + self.inputs = {'Out': infer, 'Indices': indices, "Label": label} + num_correct = 0 + for rowid in range(n): + for ele in indices[rowid]: + if ele == label[rowid]: + num_correct += 1 + break + self.outputs = { + 'Accuracy': + np.array([num_correct / float(n)]).astype(self.dtype), + 'Correct': np.array([num_correct]).astype("int32"), + 'Total': np.array([n]).astype("int32") + } + self.attrs = {'use_xpu': True} + + def init_dtype(self): + self.dtype = self.in_type + + def test_check_output(self): + if paddle.is_compiled_with_xpu(): + place = paddle.XPUPlace(0) + self.check_output_with_place(place) + +support_types = get_xpu_op_support_types('accuracy') +for stype in support_types: + create_test_class(globals(), XPUTestAccuracyOp, stype) if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py index c29150ef921c2..67fd9f871207b 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_sgd_op_xpu.py @@ -25,30 +25,43 @@ from paddle.fluid import core from paddle.fluid.op import Operator +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper -class TestSGDOp(OpTest): - def setUp(self): - self.op_type = "sgd" - self.conf() - w = np.random.random((self.h, self.w)).astype("float32") - g = np.random.random((self.h, self.w)).astype("float32") - lr = np.array([0.1]).astype("float32") - self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr} - self.outputs = {'ParamOut': w - lr * g} +class XPUTestSgdOp(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'sgd' + self.use_dynamic_create_class = False - def conf(self): - self.h = 102 - self.w = 105 + class TestSGDOp(XPUOpTest): + def setUp(self): + self.op_type = "sgd" + self.dtype = self.in_type + self.conf() + w = np.random.random((self.h, self.w)).astype(self.dtype) + g = np.random.random((self.h, self.w)).astype(self.dtype) + lr = np.array([0.1]).astype(self.dtype) - def test_check_output_with_place(self): - self.check_output_with_place(paddle.XPUPlace(0)) + self.inputs = {'Param': w, 'Grad': g, 
'LearningRate': lr} + self.outputs = {'ParamOut': w - lr * g} + def conf(self): + self.h = 102 + self.w = 105 -class TestSGDOpCase8X(TestSGDOp): - def conf(self): - self.h = 10 - self.w = 64 + def test_check_output_with_place(self): + self.check_output_with_place(paddle.XPUPlace(0)) + + class TestSGDOpCase8X(TestSGDOp): + def conf(self): + self.h = 10 + self.w = 64 + + +support_types = get_xpu_op_support_types('sgd') +for stype in support_types: + create_test_class(globals(), XPUTestSgdOp, stype) class TestSGDOpWithLargeInput(unittest.TestCase): From c5232b4b537b5eb17f82195221cb57b63a9f5ebd Mon Sep 17 00:00:00 2001 From: Haohongxiang <86215757+haohongxiang@users.noreply.github.com> Date: Wed, 11 May 2022 11:11:46 +0800 Subject: [PATCH 04/49] [Dygraph] Support diff batch for sparse of EagerReducer (#42646) * support diff batch for sparse of eagerreducer * fix --- .../fluid/distributed/collective/reducer.cc | 59 +++++++++++++++++-- .../fluid/tests/unittests/test_dist_base.py | 2 + 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index a7c3e2208ab74..96009ce722905 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -901,6 +901,9 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, dev_ctx->Wait(); + Tensor src_value_tensor(std::make_shared(src->value())); + std::vector dst_shape = src_value_tensor.shape(); + if (std::all_of(cpu_rows_num_ptr, cpu_rows_num_ptr + size_, [&](int64_t row) { return row == cpu_rows_num_ptr[0]; })) { // During sparse communication, the number of each card is same. @@ -940,8 +943,6 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, &dst_rows_vector); dev_ctx->Wait(); - Tensor src_value_tensor(std::make_shared(src->value())); - std::vector dst_shape = src_value_tensor.shape(); dst_shape[dst_shape.size() - 2] = rows_num; auto dst_dense_tensor = std::dynamic_pointer_cast( paddle::experimental::full(IntArray(dst_shape), 0, @@ -971,8 +972,58 @@ void EagerReducer::AllReduceSparse(EagerGroup *group, *(src->mutable_value()) = *(std::dynamic_pointer_cast(dst_value_tensor.impl())); } else { - PADDLE_THROW( - platform::errors::Unimplemented("This case is not supported.")); + std::vector rows_tensors; + std::vector values_tensors; + + for (int i = 0; i < size_; ++i) { + std::vector value_tensor_shape = { + cpu_rows_num_ptr[i], dst_shape[dst_shape.size() - 1]}; + Tensor rows_tensor = paddle::experimental::full( + IntArray({static_cast(cpu_rows_num_ptr[i])}), 0, + DataType::INT64, inner_place_); + Tensor values_tensor = paddle::experimental::full( + IntArray(value_tensor_shape), 0, src->value().dtype(), inner_place_); + std::vector rows_dense_vector; + std::vector values_dense_vector; + + if (i == rank_) { + auto *rows_dense_tensor = + std::dynamic_pointer_cast(rows_tensor.impl()) + .get(); + framework::TensorFromVector(src_rows, *dev_ctx, + rows_dense_tensor); + values_tensor.set_impl( + std::make_shared(src->value())); + } + rows_dense_vector.push_back( + *std::dynamic_pointer_cast(rows_tensor.impl())); + values_dense_vector.push_back( + *std::dynamic_pointer_cast(values_tensor.impl())); + + auto b_opts = BroadcastOptions(); + b_opts.source_rank = i; + process_group_->Broadcast(rows_dense_vector, rows_dense_vector, b_opts); + process_group_ + ->Broadcast(values_dense_vector, values_dense_vector, b_opts) + ->Wait(); + rows_tensors.push_back(rows_tensor); + values_tensors.push_back(values_tensor); + } 
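+      // Every rank has now broadcast its row indices and values to the
+      // others; the concat calls below stitch them together along dim 0 to
+      // rebuild the merged rows and values of the global SelectedRows.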
+ + Tensor dst_rows_tensor = + paddle::experimental::concat(rows_tensors, phi::Scalar(0)); + framework::Vector dst_rows_vector(rows_num, 0); + auto *dst_rows_dense_tensor = + std::dynamic_pointer_cast(dst_rows_tensor.impl()) + .get(); + framework::TensorToVector(*dst_rows_dense_tensor, *dev_ctx, + &dst_rows_vector); + src->set_rows(dst_rows_vector); + + Tensor dst_values_tensor = + paddle::experimental::concat(values_tensors, phi::Scalar(0)); + *(src->mutable_value()) = *( + std::dynamic_pointer_cast(dst_values_tensor.impl())); } } diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 11972059c832c..4f21b3220a9d3 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1461,6 +1461,7 @@ def check_with_place(self, need_envs={}, log_name=""): if self._dygraph and (self._gloo_mode or self._nccl2_mode): + need_envs.update({"FLAGS_enable_eager_mode": "1"}) with _test_eager_guard(): self.check_with_place_func( model_file=model_file, @@ -1468,6 +1469,7 @@ def check_with_place(self, check_error_log=check_error_log, need_envs=need_envs, log_name=log_name) + need_envs.update({"FLAGS_enable_eager_mode": "0"}) self.check_with_place_func( model_file=model_file, delta=delta, From c4bed7e4b5e22b00775f6e531627977f70bda772 Mon Sep 17 00:00:00 2001 From: feng_shuai Date: Wed, 11 May 2022 11:13:05 +0800 Subject: [PATCH 05/49] stride_slice don't support trt6 (#42639) --- paddle/fluid/inference/tensorrt/op_teller.cc | 5 +++++ .../unittests/ir/inference/test_trt_convert_strided_slice.py | 3 +++ 2 files changed, 8 insertions(+) diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index b44450e7a8212..280a1e3708bdb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -941,6 +941,11 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "strided_slice") { +#if !IS_TRT_VERSION_GE(7000) + VLOG(3) + << "strided_slice converter does not support trt versions below 7.0"; + return false; +#endif if (!with_dynamic_shape) { return false; } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py index 04eb3ab10ba7a..6a204ebbad27d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py @@ -103,6 +103,9 @@ def generate_trt_nodes_num(attrs, dynamic_shape): for x in attrs[0]["axes"]: if x == 0: return 0, 3 + ver = paddle_infer.get_trt_compile_version() + if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: + return 0, 3 return 1, 2 attrs = [ From 27acc6c397426020890523c52e1e03e5a24c57d5 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Wed, 11 May 2022 11:31:57 +0800 Subject: [PATCH 06/49] [IPU] update to popart v2.5.0 (#42552) * update to popart v2.5.0 * use a specific version of sdk2.5.0 --- paddle/fluid/memory/stats.h | 4 +- .../fluid/platform/device/ipu/ipu_strategy.cc | 48 +++++++++---- .../fluid/platform/device/ipu/ipu_strategy.h | 1 + paddle/fluid/pybind/pybind.cc | 6 ++ .../unittests/ipu/test_ipu_strategy_ipu.py | 12 +++- .../unittests/ipu/test_model_parallel_ipu.py | 67 +++++++++++++++++++ tools/dockerfile/Dockerfile.ipu | 3 +- 7 files changed, 121 insertions(+), 20 
deletions(-) diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h index 0906567dbf6c1..b4850a8e9e919 100644 --- a/paddle/fluid/memory/stats.h +++ b/paddle/fluid/memory/stats.h @@ -80,8 +80,8 @@ class Stat : public StatBase { while (prev_value < current_value && !peak_value_.compare_exchange_weak(prev_value, current_value)) { } - VLOG(8) << "Update peak_value, after update, peak_value = " << peak_value_ - << " , current value = " << current_value; + VLOG(8) << "Update peak_value, after update, peak_value = " + << peak_value_.load() << " , current value = " << current_value; } } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 20214428fab36..5bf705864ef3c 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -341,21 +341,26 @@ IpuStrategy::IpuStrategy() { return std::to_string(popart_options.partialsTypeMatMuls == "half"); }); - RegisterSetter( - container_options, "dot_checks", - [&](const std::pair& p) { - std::uint64_t value = std::stoul(p.first); - popart_options.dotChecks.insert(static_cast(value)); - }); + RegisterSetter(container_options, "dot_checks", + [&](const std::pair& p) { + std::vector valid_dot{"Fwd0", "Fwd1", "Bwd0", + "PreAlias", "Final"}; + if (std::find(valid_dot.begin(), valid_dot.end(), p.first) == + valid_dot.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown dot check: %s", p.first)); + } + popart_options.dotChecks.insert(p.first); + }); - RegisterGetter( - vector_options_getter, options_type, "dot_checks", "vector", [&]() { - std::vector res; - for (auto x : popart_options.dotChecks) { - res.push_back(std::to_string(static_cast(x))); - } - return res; - }); + RegisterGetter(vector_options_getter, options_type, "dot_checks", "vector", + [&]() { + std::vector res; + for (auto x : popart_options.dotChecks) { + res.push_back(x); + } + return res; + }); RegisterSetter(container_options, "hardware_instrumentations", [&](const std::pair& p) { @@ -516,6 +521,21 @@ void IpuStrategy::SetTensorLocation(const std::string& tensor, } } +void IpuStrategy::SetReplicatedCollectivesSettings(const std::string& opt, + bool value) { + VLOG(10) << "Set Replica Setting " << opt << " to " << value; + if (opt == "prepare_schedule_for_merging_collectives") { + popart_options.replicatedCollectivesSettings + .prepareScheduleForMergingCollectives = value; + } else if (opt == "merge_all_reduce_collectives") { + popart_options.replicatedCollectivesSettings.mergeAllReduceCollectives = + value; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unknown option ' %s' for replicated collectives settings", opt)); + } +} + void IpuStrategy::SetAccumulateOuterFragmentSettings( const std::uint64_t& schedule, const std::vector& values) { VLOG(10) << "SetAccumulateOuterFragmentSettings schedule:" << schedule; diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.h b/paddle/fluid/platform/device/ipu/ipu_strategy.h index fa57dcd676d81..da08c76fb90d1 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.h +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.h @@ -118,6 +118,7 @@ class IpuStrategy { const std::string &value); void SetTensorLocation(const std::string &tensor, const std::string &option, std::uint64_t value); + void SetReplicatedCollectivesSettings(const std::string &opt, bool value); void SetAccumulateOuterFragmentSettings(const std::uint64_t &schedule, const std::vector &values); void AddCustomOp(const 
std::string &paddle_op, const std::string &popart_op, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 602a0345b04fe..b7ecf09850838 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -4394,6 +4394,12 @@ All parameter, weight, gradient are variables in Paddle. option_name, option.first.cast(), option.second.cast()); } + } else if (option_name == "replicated_collectives_settings") { + for (auto option : element.second.cast()) { + self.SetReplicatedCollectivesSettings( + option.first.cast(), + option.second.cast()); + } } else if (option_name == "accumulate_outer_fragment") { for (auto option : element.second.cast()) { std::vector values; diff --git a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py index 45f75f1b4df81..21a6655406729 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_ipu_strategy_ipu.py @@ -27,12 +27,13 @@ def test_set_options(self): ipu_strategy = paddle.static.IpuStrategy() all_option_names = ipu_strategy._ipu_strategy.get_all_option_names() skip_options = [] + skip_options.append( + 'mean_accumulation_and_replication_reduction_strategy') skip_options.append('random_seed') for option_name in all_option_names: if option_name in skip_options: continue - option = ipu_strategy._ipu_strategy.get_option(option_name) option_type = option['type'] option_value = option['value'] @@ -67,7 +68,7 @@ def test_set_string_options(self): def test_set_other_options(self): ipu_strategy = paddle.static.IpuStrategy() options = {} - options['dot_checks'] = ['0', '1', '2', '3'] + options['dot_checks'] = ['Fwd0', 'Fwd1', 'Bwd0', 'PreAlias', "Final"] options['engine_options'] = { 'debug.allowOutOfMemory': 'true', 'autoReport.directory': 'path', @@ -76,7 +77,12 @@ def test_set_other_options(self): options['random_seed'] = 1234 for k, v in options.items(): ipu_strategy.set_options({k: v}) - assert v == ipu_strategy.get_option(k), f"set {k} to {v} failed " + if (isinstance(v, list)): + assert v.sort() == ipu_strategy.get_option(k).sort( + ), f"set {k} to {v} failed " + else: + assert v == ipu_strategy.get_option( + k), f"set {k} to {v} failed " # The custom logger need 2 int as inputs logger = lambda progress, total: print(f"compile progrss: {progress}/{total}") diff --git a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py index 792b88849faf3..884162d336f35 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_model_parallel_ipu.py @@ -148,6 +148,36 @@ def set_data_feed(self): } +class TestReplicaCollectiveInference(TestBase): + def set_attrs(self): + self.ipu_options = { + "batches_per_step": 1, + "enable_pipelining": False, + "enable_gradient_accumulation": False, + "accumulation_factor": 1, + "enable_replicated_graphs": True, + "replicated_graph_count": 2, + "accumulate_outer_fragment": { + 0: [] + }, + "replicated_collectives_settings": { + "prepare_schedule_for_merging_collectives": True, + "merge_all_reduce_collectives": True + } + } + self.cpu_bs = 1 + self.ipu_bs = 1 + + def set_data_feed(self): + np_image = np.random.rand(1, 3, 10, 10).astype(np.float32) + self.feed_cpu = {"image": np_image} + self.feed_ipu = { + "image": + np.tile(np_image, + [self.ipu_options['replicated_graph_count'], 1, 1, 1]) + } + + class 
 class TestPipelineInference(TestBase):
     def set_attrs(self):
         self.ipu_options = {
@@ -190,6 +220,36 @@ def set_attrs(self):


 class TestReplicaTrain(TestTrainBase):
+    def set_attrs(self):
+        self.ipu_options = {
+            "batches_per_step": 1,
+            "enable_pipelining": False,
+            "enable_gradient_accumulation": False,
+            "accumulation_factor": 1,
+            "enable_replicated_graphs": True,
+            "replicated_graph_count": 2
+        }
+        self.cpu_bs = 2
+        self.ipu_bs = 1
+        self.optimizer = 'sgd'
+
+    def set_data_feed(self):
+        np_image = np.random.rand(1, 3, 10, 10).astype(np.float32)
+        self.feed_cpu = {"image": np.tile(np_image, [self.cpu_bs, 1, 1, 1])}
+        self.feed_ipu = {
+            "image":
+            np.tile(np_image,
+                    [self.ipu_options['replicated_graph_count'], 1, 1, 1])
+        }
+
+    def test(self):
+        cpu_outputs = self._test_base(False)
+        ipu_outputs = self._test_base(True)[::2]
+
+        self.assertTrue(np.allclose(cpu_outputs, ipu_outputs, atol=self.atol))
+
+
+class TestReplicaCollectiveTrain(TestTrainBase):
     def set_attrs(self):
         self.ipu_options = {
             "batches_per_step": 1,
@@ -198,6 +258,13 @@
             "accumulation_factor": 1,
             "enable_replicated_graphs": True,
             "replicated_graph_count": 2,
+            "accumulate_outer_fragment": {
+                0: []
+            },
+            "replicated_collectives_settings": {
+                "prepare_schedule_for_merging_collectives": True,
+                "merge_all_reduce_collectives": True
+            }
         }
         self.cpu_bs = 2
         self.ipu_bs = 1
diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu
index 08536ae401fe1..d6c46245e501c 100644
--- a/tools/dockerfile/Dockerfile.ipu
+++ b/tools/dockerfile/Dockerfile.ipu
@@ -6,7 +6,7 @@
 # run a container
 # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash

-FROM graphcore/poplar:2.3.0
+FROM graphcore/poplar-extbaidu:2.5.0-ubuntu-18.04-20220407
 MAINTAINER PaddlePaddle Authors

 # ENV variables
@@ -25,6 +25,7 @@ RUN apt-get update && apt-get install -y curl wget vim git unzip unrar tar xz-ut
     bison graphviz libjpeg-dev zlib1g zlib1g-dev automake locales swig net-tools libtool module-init-tools numactl libnuma-dev \
     openssl libffi-dev pciutils libblas-dev gfortran libblas3 liblapack-dev liblapack3 default-jre screen tmux gdb lldb gcc g++
 RUN apt-get update && apt-get install -y rdma-core librdmacm1
+RUN apt-get update && apt-get install -y libspdlog-dev

 # Downgrade gcc&&g++
 WORKDIR /usr/bin
From 6c696db16939163edbd8b7b62d1d9d5a4a88e993 Mon Sep 17 00:00:00 2001
From: Weilong Wu
Date: Wed, 11 May 2022 11:49:41 +0800
Subject: [PATCH 07/49] [Eager] Enable_legacy_for_mkldnn_ops_on_off (#42648)

---
 .../tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
index 90614ccb3bc15..11b8858b6b195 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/check_flags_mkldnn_ops_on_off.py
@@ -20,6 +20,8 @@
 import os
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.framework import _global_flags
+from paddle.fluid.framework import _enable_legacy_dygraph
+_enable_legacy_dygraph()


 def check():
From 00ecb98f55d76d1aa88eefee10fb43bb63e98298 Mon Sep 17 00:00:00 2001
From: Jiabin Yang <360788950@qq.com>
Date: Wed, 11 May 2022 13:31:22 +0800
Subject: [PATCH 08/49] support custom operator run in double grad mode
 (#42653)

---
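Note: with this patch a custom operator's gradient can itself be
differentiated in eager mode. A minimal usage sketch (assuming a custom op
with forward, grad and double-grad kernels registered, such as the
custom_tanh loaded from custom_ops in test_custom_tanh_double_grad.py
further down in this patch):

    import paddle

    x = paddle.to_tensor([[1.0, 2.0]], dtype='float64', stop_gradient=False)
    y = custom_ops.custom_tanh(x)
    # create_graph=True records the double-grad node for the custom op
    dx = paddle.grad(y, x, create_graph=True, retain_graph=True)[0]
    # backward through dx runs RunCustomOpDoubleGradNode::operator()
    dx.backward()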
paddle/fluid/eager/api/utils/global_utils.h | 9 +- paddle/fluid/eager/backward.cc | 2 - .../custom_operator/custom_operator_node.cc | 348 +++++++++++++++++- .../custom_operator/custom_operator_node.h | 77 +++- paddle/fluid/eager/tensor_wrapper.h | 20 +- paddle/fluid/eager/utils.cc | 2 +- paddle/fluid/pybind/eager_functions.cc | 23 +- .../custom_op/test_custom_tanh_double_grad.py | 10 +- 8 files changed, 455 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 44e78c3bbf193..a4337e0826178 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -77,7 +77,8 @@ class Controller { op_meta_info_map_.insert(map.begin(), map.end()); } - std::unordered_map>>& + std::unordered_map>>>& GetCustomEdgesSlotMap() { return custom_edges_slot_map_; } @@ -89,8 +90,10 @@ class Controller { new paddle::imperative::Tracer()}; std::unordered_map> op_meta_info_map_; - /* op_type : {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}*/ - std::unordered_map>> + /* op_type : {{{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}, + * {{grad_outputs}, {grad_inputs}, {input}, {output}, {attrs}}}*/ + std::unordered_map>>> custom_edges_slot_map_; DISABLE_COPY_AND_ASSIGN(Controller); }; diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index c5a121067be72..63b899f6d6b62 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -698,8 +698,6 @@ std::vector RunBackward( } } - VLOG(6) << "Running GradNode:" << node->name(); - // Check input EnforceGradNodeHasInput(node); diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 2bb86a86e8348..abdd8cadeed4c 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -15,10 +15,151 @@ #include "paddle/fluid/eager/custom_operator/custom_operator_node.h" #include "paddle/fluid/framework/custom_operator.h" #include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/phi/api/ext/op_meta_info.h" #include "paddle/phi/core/dense_tensor.h" namespace egr { + +static void ConstructFwdAndBwdMap( + const std::vector& vec_map, + const std::string& op_type) { + auto& in_out_map = egr::Controller::Instance().GetCustomEdgesSlotMap(); + if (in_out_map.find(op_type) != in_out_map.end()) { + if (in_out_map[op_type].size() == 2) { + VLOG(7) << "Find Exist CustomEdgesSlotMap Skip >>>> "; + return; + } + } + + VLOG(7) << "Construct DoubleGrad's CustomEdgesSlotMap "; + auto inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[1]); + auto outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[1]); + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto grad_inputs_names = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_attrs_names = + paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[2]); + std::vector> res(5); + in_out_map[op_type].push_back(res); + // Prepare pos map for grad_outputs + VLOG(7) << "Prepare pos map for grad_outputs"; + PADDLE_ENFORCE_LE( + grad_outputs_names.size(), inputs_names.size(), + paddle::platform::errors::InvalidArgument( + "Grad outputs num should be less 
than or equal to forward inputs num."));
+  for (size_t i = 0; i < grad_outputs_names.size(); i++) {
+    auto end = grad_outputs_names[i].find("@GRAD@GRAD");
+    if (end != std::string::npos) {
+      for (size_t j = 0; j < inputs_names.size(); j++) {
+        if (grad_outputs_names[i].substr(0, end + 5) == inputs_names[j]) {
+          VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                  << "'s No." << j << " inputs: " << inputs_names[j]
+                  << " related to No." << i
+                  << " grad_outputs: " << grad_outputs_names[i];
+          in_out_map[op_type][1][0][j] = i;
+        }
+      }
+    } else {
+      size_t end_n = grad_outputs_names[i].find("@GRAD@NEW");
+      if (end_n != std::string::npos) {
+        for (size_t j = 0; j < inputs_names.size(); j++) {
+          if (grad_outputs_names[i].substr(0, end_n) == inputs_names[j]) {
+            VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                    << "'s No." << j << " inputs: " << inputs_names[j]
+                    << " related to No." << i
+                    << " grad_outputs: " << grad_outputs_names[i];
+            in_out_map[op_type][1][0][j] = i;
+          }
+        }
+      } else {
+        size_t end_one_grad = grad_outputs_names[i].find("@GRAD");
+        if (end_one_grad != std::string::npos) {
+          for (size_t j = 0; j < inputs_names.size(); j++) {
+            if (grad_outputs_names[i].substr(0, end_one_grad) ==
+                inputs_names[j]) {
+              VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                      << "'s No." << j << " inputs: " << inputs_names[j]
+                      << " related to No." << i
+                      << " grad_outputs: " << grad_outputs_names[i];
+              in_out_map[op_type][1][0][j] = i;
+            }
+          }
+        } else {
+          PADDLE_THROW(paddle::platform::errors::NotFound(
+              "All Grad outputs should end with @GRAD@GRAD, @GRAD@NEW or "
+              "@GRAD, but we got %s, which is not one of them. "
+              "Please check your op and change it to fit the rule.",
+              grad_outputs_names[i]));
+        }
+      }
+    }
+  }
+  // Prepare pos map for grad_inputs
+  for (size_t i = 0; i < grad_inputs_names.size(); i++) {
+    size_t end = grad_inputs_names[i].find("@GRAD@GRAD");
+    if (end != std::string::npos) {
+      for (size_t j = 0; j < outputs_names.size(); j++) {
+        if (grad_inputs_names[i].substr(0, end + 5) == outputs_names[j]) {
+          VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                  << "'s No." << j << " outputs: " << outputs_names[j]
+                  << " related to No." << i
+                  << " grad_inputs's grad: " << grad_inputs_names[i];
+          in_out_map[op_type][1][1][j] = i;
+        }
+      }
+    } else {
+      if (std::find(outputs_names.begin(), outputs_names.end(),
+                    grad_inputs_names[i]) != outputs_names.end()) {
+        for (size_t j = 0; j < outputs_names.size(); j++) {
+          if (grad_inputs_names[i] == outputs_names[j]) {
+            VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                    << "'s No." << j << " outputs: " << outputs_names[j]
+                    << " related to No." << i
+                    << " grad_inputs fwd outputs: " << grad_inputs_names[i];
+            in_out_map[op_type][1][2][j] = i;
+          }
+        }
+      } else {
+        for (size_t j = 0; j < inputs_names.size(); j++) {
+          if (grad_inputs_names[i] == inputs_names[j]) {
+            VLOG(7) << " ==== Custom Operator: " << op_type << "_grad "
+                    << "'s No." << j << " inputs: " << inputs_names[j]
+                    << " related to No."
<< i + << " grad_inputs fwd inputs: " << grad_inputs_names[i]; + in_out_map[op_type][1][3][j] = i; + } + } + } + } + } + + // Prepare pos map for grad attrs_ + for (size_t i = 0; i < grad_attrs_names.size(); i++) { + auto end = + std::find(attrs_names.begin(), attrs_names.end(), grad_attrs_names[i]); + PADDLE_ENFORCE_NE(end, attrs_names.end(), + paddle::platform::errors::NotFound( + "All Grad attrs should be one of forward attrs and " + "we got %s is not one of them, please check your " + "op and change to fit the rule.", + grad_attrs_names[i])); + for (size_t j = 0; j < attrs_names.size(); j++) { + if (grad_attrs_names[i] == attrs_names[j]) { + VLOG(7) << " ==== Custom Operator: " << op_type << "_grad " + << "'s No." << j << " attrs: " << attrs_names[j] + << " related to No." << i + << " grad_attrs: " << grad_attrs_names[i]; + in_out_map[op_type][1][4][j] = i; + } + } + } +} + paddle::small_vector, kSlotSmallVectorSize> RunCustomOpNode::operator()( @@ -38,10 +179,11 @@ RunCustomOpNode::operator()( tmp_ins(grad_inputs_name.size()); VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); - for (size_t i = 0; i < grads.size(); i++) { - if (map[1].find(i) != map[1].end()) { - VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][i]; - tmp_ins[map[1][i]] = grads[i]; + auto hooked_grads = ApplyGradientHooks(grads); + for (size_t i = 0; i < hooked_grads.size(); i++) { + if (map[0][1].find(i) != map[0][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[0][1][i]; + tmp_ins[map[0][1][i]] = hooked_grads[i]; } } @@ -69,28 +211,218 @@ RunCustomOpNode::operator()( tmp_outs(grad_outputs_names.size()); VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); for (size_t i = 0; i < OutputMeta().size(); i++) { - if (map[0].find(i) != map[0].end()) { + if (map[0][0].find(i) != map[0][0].end()) { VLOG(7) << "Insert grad outputs: " << i << " with size: " << OutputMeta()[i].size() - << " to tmp_outputs: " << map[0][i]; + << " to tmp_outputs: " << map[0][0][i]; for (size_t j = 0; j < OutputMeta()[i].size(); j++) { outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ std::make_shared( phi::DataType::UNDEFINED), egr::Controller::Instance().GenerateUniqueName( "custom_tmp_grad")); + egr::EagerUtils::autograd_meta(&(outs[i][j])); } - tmp_outs[map[0][i]] = outs[i]; + tmp_outs[map[0][0][i]] = outs[i]; } } for (size_t i = 0; i < tmp_outs.size(); i++) { VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); ctx.EmplaceBackOutputs(tmp_outs[i]); } - VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_; + VLOG(7) << "Run Kernel of Grad Custom Op: " << op_type_ << "_grad"; (*paddle::framework::OpMetaInfoHelper::GetKernelFn( kernel_map.at(op_type_)[1]))(&ctx); + + VLOG(7) << "Get AutogradMeta for inputs and outputs for Custom Op"; + std::vector> ins_auto_grad_metas; + std::vector> outs_auto_grad_metas; + VLOG(7) << "We got slot num of ins is: " << ctx.InputRange().size(); + ins_auto_grad_metas.resize(ctx.InputRange().size()); + VLOG(7) << "We got slot num of outs is: " << ctx.OutputRange().size(); + outs_auto_grad_metas.resize(ctx.OutputRange().size()); + + for (size_t i = 0; i < ctx.InputRange().size(); i++) { + ins_auto_grad_metas[i] = + egr::EagerUtils::nullable_autograd_meta(ctx.InputsBetween( + ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second)); + } + + for (size_t i = 0; i < ctx.OutputRange().size(); i++) { + outs_auto_grad_metas[i] = + 
egr::EagerUtils::unsafe_autograd_meta(ctx.OutputsBetweeen( + ctx.OutputRangeAt(i).first, ctx.OutputRangeAt(i).second)); + } + bool require_any_grad = false; + bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + require_any_grad = + require_any_grad || egr::EagerUtils::ComputeRequireGrad( + trace_backward, &(ins_auto_grad_metas[i])); + } + + if (require_any_grad) { + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + paddle::platform::RecordEvent node_creation_record_event( + "Custom Op " + op_type_ + " double_grad node_creation", + paddle::platform::TracerEventType::OperatorInner, 1); + VLOG(6) << " Construct Grad for Custom Op: " << op_type_; + ConstructFwdAndBwdMap(vec_map, op_type_); + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + egr::EagerUtils::PassStopGradient(false, &(outs_auto_grad_metas[i])); + } + auto grad_node = std::make_shared( + outs_auto_grad_metas.size(), ins_auto_grad_metas.size(), op_type_); + + auto slot_map = + egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + // Prepare Grad outputs + size_t no_grad_cnt = 0; + for (size_t i = 0; i < ins_auto_grad_metas.size(); i++) { + const std::vector& in_tensors = + ctx.InputsBetween(ctx.InputRangeAt(i).first, + ctx.InputRangeAt(i).second); + + if (slot_map[1][0].find(i) != slot_map[1][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[1][0][i]); + } else { + grad_node->SetGradOutMeta(in_tensors, + ins_auto_grad_metas.size() - 1 - no_grad_cnt); + no_grad_cnt++; + } + } + + // Prepare Grad inputs with grad of fwd outputs + for (size_t i = 0; i < outs_auto_grad_metas.size(); i++) { + const std::vector& out_tensors = + ctx.OutputsBetweeen(ctx.OutputRangeAt(i).first, + ctx.OutputRangeAt(i).second); + egr::EagerUtils::SetOutRankWithSlot(&(outs_auto_grad_metas[i]), i); + egr::EagerUtils::SetHistory(&(outs_auto_grad_metas[i]), grad_node); + grad_node->SetGradInMeta(out_tensors, i); + egr::EagerUtils::CheckAndRetainGrad(out_tensors); + } + + // Prepare Grad inputs with fwd outputs + for (auto it = slot_map[1][2].begin(); it != slot_map[1][2].end(); it++) { + VLOG(7) << "Prepare fwd_outs: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_outs[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.OutputsBetweeen(ctx.OutputRangeAt(it->first).first, + ctx.OutputRangeAt(it->first).second)); + } + + // Prepare Grad inputs with fwd inputs + for (auto it = slot_map[1][3].begin(); it != slot_map[1][3].end(); it++) { + VLOG(7) << "Prepare fwd_ins: " << it->first + << " to grad_inputs: " << it->second; + grad_node->fwd_ins[it->second] = + egr::RunCustomOpNode::ConstructTensorWrapper( + ctx.InputsBetween(ctx.InputRangeAt(it->first).first, + ctx.InputRangeAt(it->first).second)); + } + + auto attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs( + meta_info_map.at(op_type_)[2]); + std::vector attrs(attrs_names.size()); + // Prepare attrs for Grad node + for (auto it = slot_map[1][4].begin(); it != slot_map[1][4].end(); it++) { + VLOG(7) << "Prepare fwd attrs: " << it->first + << " to grad_attrs: " << it->second; + attrs[it->second] = attrs_[it->first]; + } + grad_node->SetAttrs(attrs); + } + + return outs; +} + +paddle::small_vector, + kSlotSmallVectorSize> +RunCustomOpDoubleGradNode::operator()( + paddle::small_vector, + kSlotSmallVectorSize>& grads, + bool create_graph, bool is_new_grad) { // NOLINT + 
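+  // Data-flow sketch for this double-grad call (illustrative; the slot_map
+  // indices follow the convention built by ConstructFwdAndBwdMap above):
+  //   grads --ApplyGradientHooks--> hooked_grads --map[1][1]--> ctx inputs
+  //   fwd_outs / fwd_ins (TensorWrappers) --Recover()--> ctx inputs
+  //   ctx outputs --map[1][0]--> outs returned to the autograd engine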
paddle::CustomOpKernelContext ctx; + auto meta_info_map = egr::Controller::Instance().GetOpMetaInfoMap(); + const auto& vec_map = meta_info_map.at(op_type_); + auto grad_inputs_name = + paddle::framework::OpMetaInfoHelper::GetInputs(vec_map[2]); + auto grad_outputs_names = + paddle::framework::OpMetaInfoHelper::GetOutputs(vec_map[2]); + auto map = egr::Controller::Instance().GetCustomEdgesSlotMap().at(op_type_); + auto kernel_map = egr::Controller::Instance().GetOpMetaInfoMap(); + + paddle::small_vector, + kSlotSmallVectorSize> + tmp_ins(grad_inputs_name.size()); + VLOG(7) << " Prepare Backward inputs of grads with size: " << grads.size() + << ", whose grad_inputs_name size is: " << grad_inputs_name.size(); + + auto hooked_grads = ApplyGradientHooks(grads); + + for (size_t i = 0; i < hooked_grads.size(); i++) { + if (map[1][1].find(i) != map[1][1].end()) { + VLOG(7) << "Insert grad: " << i << " to grad_inputs: " << map[1][1][i]; + tmp_ins[map[1][1][i]] = hooked_grads[i]; + } + } + + for (auto it : fwd_outs) { + VLOG(7) << "Insert fwd_outs to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + for (auto it : fwd_ins) { + VLOG(7) << "Insert fwd_ins to grad_inputs: " << it.first; + tmp_ins[it.first] = RunCustomOpDoubleGradNode::Recover(&(it.second)); + } + + VLOG(6) << "Prepare Grad inputs"; + for (const auto& in : tmp_ins) { + ctx.EmplaceBackInputs(in); + } + VLOG(6) << "Prepare Grad attrs"; + ctx.EmplaceBackAttrs(attrs_); + paddle::small_vector, + kSlotSmallVectorSize> + outs(OutputMeta().size()); + paddle::small_vector, + kSlotSmallVectorSize> + tmp_outs(grad_outputs_names.size()); + VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); + + for (const auto& name : grad_outputs_names) { + VLOG(6) << "Prepare Grad outputs name is: " << name; + } + + for (size_t i = 0; i < OutputMeta().size(); i++) { + if (map[1][0].find(i) != map[1][0].end()) { + VLOG(7) << "Insert grad outputs: " << i + << " with size: " << OutputMeta()[i].size() + << " to tmp_outputs: " << map[1][0][i]; + for (size_t j = 0; j < OutputMeta()[i].size(); j++) { + outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + } + tmp_outs[map[1][0][i]] = outs[i]; + } + } + for (size_t i = 0; i < tmp_outs.size(); i++) { + VLOG(7) << "Prepare grad outputs size: " << tmp_outs[i].size(); + ctx.EmplaceBackOutputs(tmp_outs[i]); + } + VLOG(7) << "Run Kernel of Grad Custom Op: " << name(); + + (*paddle::framework::OpMetaInfoHelper::GetKernelFn( + kernel_map.at(op_type_)[2]))(&ctx); + return outs; } } // namespace egr diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.h b/paddle/fluid/eager/custom_operator/custom_operator_node.h index 4801088e51ba5..feea23730676e 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.h +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.h @@ -67,7 +67,11 @@ class RunCustomOpNode : public GradNodeBase { return res; } - void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } void SetAttrs(const std::vector& attr) { attrs_ = attr; } @@ -87,4 +91,75 @@ class RunCustomOpNode : public GradNodeBase { std::string op_type_{""}; }; +class RunCustomOpDoubleGradNode : public GradNodeBase { + public: + // Constructor: configure fwd input 
tensors to grad node + explicit RunCustomOpDoubleGradNode(size_t bwd_in_slot_num, + size_t bwd_out_slot_num, + const std::string& op_type) + : GradNodeBase(bwd_in_slot_num, bwd_out_slot_num), op_type_(op_type) { + VLOG(6) << "Construct RunCustomOpDoubleGradNode for op: " << op_type; + } + + ~RunCustomOpDoubleGradNode() override { + VLOG(6) << "Destruct RunCustomOpDoubleGradNode for op: " << op_type_; + } + + // Functor: perform backward computations + virtual paddle::small_vector, + kSlotSmallVectorSize> + operator()( // NOLINT + paddle::small_vector, + kSlotSmallVectorSize>& grads, // NOLINT + bool create_graph = false, + bool is_new_grad = false) // NOLINT + override; + + std::string name() { + return paddle::string::Sprintf("RunCustomOpDoubleGradNode: %s_grad_grad", + op_type_); + } + + static std::vector ConstructTensorWrapper( + const std::vector& fwd_var) { + std::vector res; + for (auto const& var : fwd_var) { + res.emplace_back(var); + } + return res; + } + + static std::vector Recover( + std::vector* fwd_var) { + std::vector res; + for (size_t i = 0; i < fwd_var->size(); i++) { + res.emplace_back(fwd_var->at(i).recover()); + } + return res; + } + + void ClearTensorWrappers() override { + fwd_outs.clear(); + fwd_ins.clear(); + grads2grad_in_map.clear(); + } + + void SetAttrs(const std::vector& attr) { attrs_ = attr; } + + std::shared_ptr Copy() const override { + auto copied_node = std::shared_ptr( + new RunCustomOpDoubleGradNode(*this)); + return copied_node; + } + + public: + std::unordered_map> fwd_outs; + std::unordered_map> fwd_ins; + std::unordered_map grads2grad_in_map; + + private: + std::vector attrs_; + std::string op_type_{""}; +}; + } // namespace egr diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 28b116b41ea91..a90b7bc7d7202 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -119,18 +119,24 @@ class TensorWrapper { paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; std::shared_ptr new_grad_node = weak_grad_node_.lock(); - auto* intermediate_autograd_meta = - EagerUtils::unsafe_autograd_meta(intermidiate_tensor_); - auto p_ab_autograd_meta = - std::make_shared(*intermediate_autograd_meta); if (new_grad_node) { VLOG(3) << "Recovered TensorWrapper with GradNode " << new_grad_node->name() << " addr: " << new_grad_node.get(); - p_ab_autograd_meta->SetGradNode(new_grad_node); } else { - VLOG(3) << "Recovered TensorWrapper with Empth GradNode"; + VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::nullable_autograd_meta(intermidiate_tensor_); + + if (intermediate_autograd_meta) { + auto p_ab_autograd_meta = + std::make_shared(*intermediate_autograd_meta); + if (new_grad_node) { + p_ab_autograd_meta->SetGradNode(new_grad_node); + } + recovered_tensor.set_autograd_meta(p_ab_autograd_meta); } - recovered_tensor.set_autograd_meta(p_ab_autograd_meta); + return recovered_tensor; } } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 033af5c496c98..65294a8eb7abc 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -157,7 +157,7 @@ void EagerUtils::SetHistory(std::vector* autograd_metas, if (autograd_meta->GradNode()) { VLOG(7) << "Should not set grad node twice, original node is:" << autograd_meta->GradNode()->name() - << "current is: " << grad_node->name(); + << " current is: " << grad_node->name(); } autograd_meta->SetGradNode(grad_node); } diff --git 
a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index ac33eb2359c8c..5395b4f31c83b 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -207,7 +207,8 @@ static void ConstructFwdAndBwdMap( auto grad_attrs_names = paddle::framework::OpMetaInfoHelper::GetAttrs(vec_map[1]); std::vector> res(5); - in_out_map.insert({op_type, res}); + + in_out_map.insert({op_type, {res}}); // Prepare pos map for grad_outputs VLOG(7) << "Prepare pos map for grad_outputs"; PADDLE_ENFORCE_LE( @@ -227,7 +228,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " inputs: " << inputs_names[j] << " related to No." << i << " grad_outputs: " << grad_outputs_names[i]; - in_out_map[op_type][0][j] = i; + in_out_map[op_type][0][0][j] = i; } } } @@ -240,7 +241,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs's grad: " << grad_inputs_names[i]; - in_out_map[op_type][1][j] = i; + in_out_map[op_type][0][1][j] = i; } } } else { @@ -252,7 +253,7 @@ static void ConstructFwdAndBwdMap( << " outputs: " << outputs_names[j] << " related to No." << i << " grad_inputs fwd outputs: " << grad_inputs_names[i]; - in_out_map[op_type][2][j] = i; + in_out_map[op_type][0][2][j] = i; } } } else { @@ -262,7 +263,7 @@ static void ConstructFwdAndBwdMap( << " inputs: " << inputs_names[j] << " related to No." << i << " grad_inputs fwd inputs: " << grad_inputs_names[i]; - in_out_map[op_type][3][j] = i; + in_out_map[op_type][0][3][j] = i; } } } @@ -284,7 +285,7 @@ static void ConstructFwdAndBwdMap( VLOG(7) << " ==== Custom Operator: " << op_type << "'s No." << j << " attrs: " << attrs_names[j] << " related to No." 
<< i << " grad_attrs: " << grad_attrs_names[i]; - in_out_map[op_type][4][j] = i; + in_out_map[op_type][0][4][j] = i; } } } @@ -402,8 +403,8 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, ctx.InputsBetween(ctx.InputRangeAt(i).first, ctx.InputRangeAt(i).second); - if (slot_map[0].find(i) != slot_map[0].end()) { - grad_node->SetGradOutMeta(in_tensors, slot_map[0][i]); + if (slot_map[0][0].find(i) != slot_map[0][0].end()) { + grad_node->SetGradOutMeta(in_tensors, slot_map[0][0][i]); } else { grad_node->SetGradOutMeta(in_tensors, ins_auto_grad_metas.size() - 1 - no_grad_cnt); @@ -423,7 +424,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd outputs - for (auto it = slot_map[2].begin(); it != slot_map[2].end(); it++) { + for (auto it = slot_map[0][2].begin(); it != slot_map[0][2].end(); it++) { VLOG(7) << "Prepare fwd_outs: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_outs[it->second] = @@ -433,7 +434,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, } // Prepare Grad inputs with fwd inputs - for (auto it = slot_map[3].begin(); it != slot_map[3].end(); it++) { + for (auto it = slot_map[0][3].begin(); it != slot_map[0][3].end(); it++) { VLOG(7) << "Prepare fwd_ins: " << it->first << " to grad_inputs: " << it->second; grad_node->fwd_ins[it->second] = @@ -446,7 +447,7 @@ static PyObject* eager_api_run_costum_op(PyObject* self, PyObject* args, meta_info_map.at(op_type)[1]); std::vector attrs(attrs_names.size()); // Prepare attrs for Grad node - for (auto it = slot_map[4].begin(); it != slot_map[4].end(); it++) { + for (auto it = slot_map[0][4].begin(); it != slot_map[0][4].end(); it++) { VLOG(7) << "Prepare fwd attrs: " << it->first << " to grad_attrs: " << it->second; attrs[it->second] = res_attrs[it->first]; diff --git a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py index 5664c00d74f89..3b3a0e2edec98 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_tanh_double_grad.py @@ -21,8 +21,7 @@ from paddle.utils.cpp_extension import load, get_build_directory from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_cc_args, extra_nvcc_args -from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph -_enable_legacy_dygraph() +from paddle.fluid.framework import _test_eager_guard # Because Windows don't use docker, the shared lib already exists in the # cache dir, it will not be compiled again unless the shared lib is removed. 
@@ -64,7 +63,7 @@ def setUp(self): self.dtypes = ['float32', 'float64'] self.devices = ['cpu'] - def test_func_double_grad_dynamic(self): + def func_double_grad_dynamic(self): for device in self.devices: for dtype in self.dtypes: x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) @@ -85,6 +84,11 @@ def test_func_double_grad_dynamic(self): "custom op out grad: {},\n paddle api out grad: {}".format( dout, pd_dout)) + def test_func_double_grad_dynamic(self): + with _test_eager_guard(): + self.func_double_grad_dynamic() + self.func_double_grad_dynamic() + if __name__ == "__main__": unittest.main() From 5d55ebdefe7950a8b719474b5422b9b614e6d9c0 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 May 2022 14:14:36 +0800 Subject: [PATCH 09/49] print start time, end time for each compilation command (#42647) * print start time, end time for each compilation command * add process info --- tools/get_build_time.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh index a89c024f97ea2..1563fefff3799 100755 --- a/tools/get_build_time.sh +++ b/tools/get_build_time.sh @@ -15,4 +15,8 @@ # limitations under the License. CUR_ROOT=$(dirname "$0")/.. -/usr/bin/time -f '%C, %E elapsed, %U user, %S sys' "$@" >> $CUR_ROOT/build/build-time 2>&1 +start=$(date +%s.%N) +duration=$("/usr/bin/time" -f "%C, %E elapsed, %U user, %S sys" "$@" 2>&1) +end=$(date +%s.%N) + +echo ${duration}, 'start', $start, 'end', $end, 'process', $$ >> $CUR_ROOT/build/build-time From cbb8df78873bda7b4837dc4ed804981a0fca08a1 Mon Sep 17 00:00:00 2001 From: Feng Ni Date: Wed, 11 May 2022 15:40:13 +0800 Subject: [PATCH 10/49] [New API] add API paddle.vision.transforms.RandomAffine and paddle.vision.transforms.affine (#42278) * add affine codes * adjustment codes * fix test case * fix F_cv2.affine * clean codes, add UT * fix UT * fix UT * fix UT shear * add functional test_errors * fix typos and coments, test=develop --- python/paddle/tests/test_transforms.py | 145 ++++++++++++++ python/paddle/vision/transforms/__init__.py | 4 + python/paddle/vision/transforms/functional.py | 160 +++++++++++++++ .../vision/transforms/functional_cv2.py | 80 ++++++++ .../vision/transforms/functional_pil.py | 26 +++ .../vision/transforms/functional_tensor.py | 45 ++++- python/paddle/vision/transforms/transforms.py | 183 ++++++++++++++++++ 7 files changed, 641 insertions(+), 2 deletions(-) diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 242680bc7c738..38cad05bfcb89 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -123,6 +123,44 @@ def test_color_jitter(self): ]) self.do_transform(trans) + def test_affine(self): + trans = transforms.Compose([ + transforms.RandomAffine(90), + transforms.RandomAffine( + [-10, 10], translate=[0.1, 0.3]), + transforms.RandomAffine( + 45, translate=[0.2, 0.2], scale=[0.2, 0.5]), + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40]), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear'), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + interpolation='bilinear', + fill=114), + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + 
interpolation='bilinear', + fill=114, + center=(60, 80)), + ]) + self.do_transform(trans) + def test_rotate(self): trans = transforms.Compose([ transforms.RandomRotation(90), @@ -278,6 +316,35 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 20, 40], + fill=114, + center=(1, 2, 3)) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -479,6 +546,29 @@ def test_exception(self): tensor_img = paddle.rand((3, 100, 100)) F.pad(tensor_img, [1.0, 2.0, 3.0]) + with self.assertRaises(ValueError): + transforms.RandomAffine(-10) + + with self.assertRaises(ValueError): + transforms.RandomAffine([-30, 60], translate=[2, 2]) + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[-2, -1]), + + with self.assertRaises(ValueError): + transforms.RandomAffine(10, translate=[0.2, 0.2], scale=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[1, 2, 3]), + + with self.assertRaises(ValueError): + transforms.RandomAffine( + 10, + translate=[0.5, 0.3], + scale=[0.7, 1.3], + shear=[-10, 10, 0, 20, 40]) + with self.assertRaises(ValueError): transforms.RandomRotation(-2) @@ -547,6 +637,36 @@ def test_errors(self): with self.assertRaises(TypeError): F.adjust_saturation(1, 0.1) + with self.assertRaises(TypeError): + F.affine('45') + + with self.assertRaises(TypeError): + F.affine(45, translate=0.3) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2, 0.3]) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=-0.5) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=10) + + with self.assertRaises(TypeError): + F.affine(45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 0, 10]) + + with self.assertRaises(TypeError): + F.affine( + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10], + interpolation=2) + + with self.assertRaises(TypeError): + F.affine( + 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10], center=0) + with self.assertRaises(TypeError): F.rotate(1, 0.1) @@ -785,6 +905,31 @@ def test_image_load(self): os.remove(path) + def test_affine(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + np_affined_img = F.affine( + np_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + pil_affined_img = F.affine( + pil_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + tensor_affined_img = F.affine( + tensor_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + + np.testing.assert_equal(np_affined_img.shape, + 
np.array(pil_affined_img).shape) + np.testing.assert_equal(np_affined_img.shape, + tensor_affined_img.transpose((1, 2, 0)).shape) + + np.testing.assert_almost_equal( + np.array(pil_affined_img), + tensor_affined_img.numpy().transpose((1, 2, 0)), + decimal=4) + def test_rotate(self): np_img = (np.random.rand(28, 28, 3) * 255).astype('uint8') pil_img = Image.fromarray(np_img).convert('RGB') diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index b255e663e6876..41e9b188e34ed 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -28,6 +28,7 @@ from .transforms import ColorJitter # noqa: F401 from .transforms import RandomCrop # noqa: F401 from .transforms import Pad # noqa: F401 +from .transforms import RandomAffine # noqa: F401 from .transforms import RandomRotation # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 @@ -37,6 +38,7 @@ from .functional import vflip # noqa: F401 from .functional import resize # noqa: F401 from .functional import pad # noqa: F401 +from .functional import affine # noqa: F401 from .functional import rotate # noqa: F401 from .functional import to_grayscale # noqa: F401 from .functional import crop # noqa: F401 @@ -64,6 +66,7 @@ 'ColorJitter', 'RandomCrop', 'Pad', + 'RandomAffine', 'RandomRotation', 'Grayscale', 'ToTensor', @@ -73,6 +76,7 @@ 'vflip', 'resize', 'pad', + 'affine', 'rotate', 'to_grayscale', 'crop', diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 29a857ba570f6..83f756e6ed2a6 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -537,6 +537,166 @@ def adjust_hue(img, hue_factor): return F_t.adjust_hue(img, hue_factor) +def _get_affine_matrix(center, angle, translate, scale, shear): + # Affine matrix is : M = T * C * RotateScaleShear * C^-1 + # Ihe inverse one is : M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1 + rot = math.radians(angle) + sx = math.radians(shear[0]) + sy = math.radians(shear[1]) + + # Rotate and Shear without scaling + a = math.cos(rot - sy) / math.cos(sy) + b = -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot) + c = math.sin(rot - sy) / math.cos(sy) + d = -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot) + + # Center Translation + cx, cy = center + tx, ty = translate + + # Inverted rotation matrix with scale and shear + # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1 + matrix = [d, -b, 0.0, -c, a, 0.0] + matrix = [x / scale for x in matrix] + # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1 + matrix[2] += matrix[0] * (-cx - tx) + matrix[1] * (-cy - ty) + matrix[5] += matrix[3] * (-cx - tx) + matrix[4] * (-cy - ty) + # Apply center translation: C * RSS^-1 * C^-1 * T^-1 + matrix[2] += cx + matrix[5] += cy + + return matrix + + +def affine(img, + angle, + translate, + scale, + shear, + interpolation="nearest", + fill=0, + center=None): + """Apply affine transformation on the image. + + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be affined. + angle (int|float): The angle of the random rotation in clockwise order. + translate (list[float]): Maximum absolute fraction for horizontal and vertical translations. + scale (float): Scale factor for the image, scale should be positive. 
+ shear (list[float]): Shear angle values which are parallel to the x-axis and y-axis in clockwise order. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + + Returns: + PIL.Image|np.array|paddle.Tensor: Affine Transformed image. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + affined_img = F.affine(fake_img, 45, translate=[0.2, 0.2], scale=0.5, shear=[-10, 10]) + print(affined_img.shape) + """ + + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if not isinstance(angle, (int, float)): + raise TypeError("Argument angle should be int or float") + + if not isinstance(translate, (list, tuple)): + raise TypeError("Argument translate should be a sequence") + + if len(translate) != 2: + raise ValueError("Argument translate should be a sequence of length 2") + + if scale <= 0.0: + raise ValueError("Argument scale should be positive") + + if not isinstance(shear, (numbers.Number, (list, tuple))): + raise TypeError( + "Shear should be either a single value or a sequence of two values") + + if not isinstance(interpolation, str): + raise TypeError("Argument interpolation should be a string") + + if isinstance(angle, int): + angle = float(angle) + + if isinstance(translate, tuple): + translate = list(translate) + + if isinstance(shear, numbers.Number): + shear = [shear, 0.0] + + if isinstance(shear, tuple): + shear = list(shear) + + if len(shear) == 1: + shear = [shear[0], shear[0]] + + if len(shear) != 2: + raise ValueError( + f"Shear should be a sequence containing two values. 
Got {shear}") + + if center is not None and not isinstance(center, (list, tuple)): + raise TypeError("Argument center should be a sequence") + + if _is_pil_image(img): + width, height = img.size + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = [width * 0.5, height * 0.5] + matrix = _get_affine_matrix(center, angle, translate, scale, shear) + return F_pil.affine(img, matrix, interpolation, fill) + + if _is_numpy_image(img): + # get affine_matrix in F_cv2.affine() using cv2's functions + width, height = img.shape[0:2] + # center = (width * 0.5 + 0.5, height * 0.5 + 0.5) + # it is visually better to estimate the center without 0.5 offset + # otherwise image rotated by 90 degrees is shifted vs output image of F_t.affine + if center is None: + center = (width * 0.5, height * 0.5) + return F_cv2.affine(img, angle, translate, scale, shear, interpolation, + fill, center) + + if _is_tensor_image(img): + center_f = [0.0, 0.0] + if center is not None: + height, width = img.shape[-1], img.shape[-2] + # Center values should be in pixel coordinates but translated such that (0, 0) corresponds to image center. + center_f = [ + 1.0 * (c - s * 0.5) for c, s in zip(center, [width, height]) + ] + translate_f = [1.0 * t for t in translate] + matrix = _get_affine_matrix(center_f, angle, translate_f, scale, shear) + return F_t.affine(img, matrix, interpolation, fill) + + def rotate(img, angle, interpolation="nearest", diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 8343a8c340ffb..d20bf3e60d907 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -411,6 +411,86 @@ def adjust_hue(img, hue_factor): return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) +def affine(img, + angle, + translate, + scale, + shear, + interpolation='nearest', + fill=0, + center=None): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. + translate (sequence or int): horizontal and vertical translations + scale (float): overall scale ratio + shear (sequence or float): shear angle value in degrees between -180 to 180, clockwise direction. + If a sequence is specified, the first value corresponds to a shear parallel to the x axis, while + the second value corresponds to a shear parallel to the y axis. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + center (sequence, optional): Optional center of rotation. Origin is the upper left corner. + Default is the center of the image. + + Returns: + np.array: Affined image. 
+ + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + + h, w = img.shape[0:2] + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + if center is None: + center = (w / 2.0, h / 2.0) + + M = np.ones([2, 3]) + # Rotate and Scale + R = cv2.getRotationMatrix2D(angle=angle, center=center, scale=scale) + + # Shear + sx = math.tan(shear[0] * math.pi / 180) + sy = math.tan(shear[1] * math.pi / 180) + M[0] = R[0] + sy * R[1] + M[1] = R[1] + sx * R[0] + + # Translation + tx, ty = translate + M[0, 2] = tx + M[1, 2] = ty + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpAffine( + img, + M, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def rotate(img, angle, interpolation='nearest', diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 71f7759f11b66..4c342e31b7f89 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -410,6 +410,32 @@ def adjust_hue(img, hue_factor): return img +def affine(img, matrix, interpolation="nearest", fill=0): + """Affine the image by matrix. + + Args: + img (PIL.Image): Image to be affined. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the affined image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Affined image. + + """ + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.AFFINE, matrix, + _pil_interp_from_str[interpolation], fill) + + def rotate(img, angle, interpolation="nearest", diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index 2e276883cd376..cafb2655659b0 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -226,8 +226,8 @@ def _affine_grid(theta, w, h, ow, oh): def _grid_transform(img, grid, mode, fill): if img.shape[0] > 1: - grid = grid.expand(img.shape[0], grid.shape[1], grid.shape[2], - grid.shape[3]) + grid = grid.expand( + shape=[img.shape[0], grid.shape[1], grid.shape[2], grid.shape[3]]) if fill is not None: dummy = paddle.ones( @@ -255,6 +255,47 @@ def _grid_transform(img, grid, mode, fill): return img +def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): + """Affine to the image by matrix. + + Args: + img (paddle.Tensor): Image to be rotated. + matrix (float or int): Affine matrix. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST . when use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. 
+ data_format (str, optional): Data format of img, should be 'HWC' or + 'CHW'. Default: 'CHW'. + + Returns: + paddle.Tensor: Affined image. + + """ + img = img.unsqueeze(0) + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + + matrix = paddle.to_tensor(matrix, place=img.place) + matrix = matrix.reshape((1, 2, 3)) + shape = img.shape + + grid = _affine_grid( + matrix, w=shape[-1], h=shape[-2], ow=shape[-1], oh=shape[-2]) + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + + return out.squeeze(0) + + def rotate(img, angle, interpolation='nearest', diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index ce356449c594e..42dfd6dfa4f81 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1205,6 +1205,189 @@ def _apply_image(self, img): return F.pad(img, self.padding, self.fill, self.padding_mode) +def _check_sequence_input(x, name, req_sizes): + msg = req_sizes[0] if len(req_sizes) < 2 else " or ".join( + [str(s) for s in req_sizes]) + if not isinstance(x, Sequence): + raise TypeError(f"{name} should be a sequence of length {msg}.") + if len(x) not in req_sizes: + raise ValueError(f"{name} should be sequence of length {msg}.") + + +def _setup_angle(x, name, req_sizes=(2, )): + if isinstance(x, numbers.Number): + if x < 0: + raise ValueError( + f"If {name} is a single number, it must be positive.") + x = [-x, x] + else: + _check_sequence_input(x, name, req_sizes) + + return [float(d) for d in x] + + +class RandomAffine(BaseTransform): + """Random affine transformation of the image. + + Args: + degrees (int|float|tuple): The angle interval of the random rotation. + If set as a number instead of sequence like (min, max), the range of degrees + will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate. + translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations. + For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a + and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. + Default is None, will not translate. + scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. + Default is None, will keep original scale and not scale. + shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order. + If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. + Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. + Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. + Default is None, will not apply shear. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. 
+ When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + center (2-tuple, optional): Optional center of rotation, (x, y). + Origin is the upper left corner. + Default is the center of the image. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): An affined image. + + Returns: + A callable object of RandomAffine. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomAffine + + transform = RandomAffine([-90, 90], translate=[0.2, 0.2], scale=[0.5, 0.5], shear=[-10, 10]) + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + degrees, + translate=None, + scale=None, + shear=None, + interpolation='nearest', + fill=0, + center=None, + keys=None): + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2, )) + + super(RandomAffine, self).__init__(keys) + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + self.interpolation = interpolation + + if translate is not None: + _check_sequence_input(translate, "translate", req_sizes=(2, )) + for t in translate: + if not (0.0 <= t <= 1.0): + raise ValueError( + "translation values should be between 0 and 1") + self.translate = translate + + if scale is not None: + _check_sequence_input(scale, "scale", req_sizes=(2, )) + for s in scale: + if s <= 0: + raise ValueError("scale values should be positive") + self.scale = scale + + if shear is not None: + self.shear = _setup_angle(shear, name="shear", req_sizes=(2, 4)) + else: + self.shear = shear + + if fill is None: + fill = 0 + elif not isinstance(fill, (Sequence, numbers.Number)): + raise TypeError("Fill should be either a sequence or a number.") + self.fill = fill + + if center is not None: + _check_sequence_input(center, "center", req_sizes=(2, )) + self.center = center + + def _get_param(self, + img_size, + degrees, + translate=None, + scale_ranges=None, + shears=None): + """Get parameters for affine transformation + + Returns: + params to be passed to the affine transformation + """ + angle = random.uniform(degrees[0], degrees[1]) + + if translate is not None: + max_dx = float(translate[0] * img_size[0]) + max_dy = float(translate[1] * img_size[1]) + tx = int(random.uniform(-max_dx, max_dx)) + ty = int(random.uniform(-max_dy, max_dy)) + translations = (tx, ty) + else: + translations = (0, 0) + + if scale_ranges is not None: + scale = random.uniform(scale_ranges[0], scale_ranges[1]) + else: + scale = 1.0 + + shear_x, shear_y = 0.0, 0.0 + if shears is not None: + shear_x = random.uniform(shears[0], shears[1]) + if len(shears) == 4: + shear_y = random.uniform(shears[2], shears[3]) + shear = (shear_x, shear_y) + + return angle, translations, scale, shear + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array): Image to be affine transformed. + + Returns: + PIL.Image or np.array: Affine transformed image. 
+ """ + + w, h = _get_image_size(img) + img_size = [w, h] + + ret = self._get_param(img_size, self.degrees, self.translate, + self.scale, self.shear) + + return F.affine( + img, + *ret, + interpolation=self.interpolation, + fill=self.fill, + center=self.center) + + class RandomRotation(BaseTransform): """Rotates the image by angle. From 754820fe9e78c922d09bb44f9dc2e5579c68fa20 Mon Sep 17 00:00:00 2001 From: Feng Ni Date: Wed, 11 May 2022 15:40:26 +0800 Subject: [PATCH 11/49] [New API] add API paddle.vision.transforms.RandomPerspective and paddle.vision.transforms.perspective (#42390) * add RandomPerspective and perspective * fix UT, clean codes * fix UT * add batch transform * remove batch in tensor func * fix typos and coments, test=develop --- python/paddle/tests/test_transforms.py | 39 ++++++ python/paddle/vision/transforms/__init__.py | 4 + python/paddle/vision/transforms/functional.py | 89 +++++++++++++ .../vision/transforms/functional_cv2.py | 50 ++++++++ .../vision/transforms/functional_pil.py | 27 ++++ .../vision/transforms/functional_tensor.py | 63 ++++++++++ python/paddle/vision/transforms/transforms.py | 119 ++++++++++++++++++ 7 files changed, 391 insertions(+) diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 38cad05bfcb89..82ae3cb6b68f6 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -172,6 +172,14 @@ def test_rotate(self): ]) self.do_transform(trans) + def test_perspective(self): + trans = transforms.Compose([ + transforms.RandomPerspective(prob=1.0), + transforms.RandomPerspective( + prob=1.0, distortion_scale=0.9), + ]) + self.do_transform(trans) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -964,6 +972,37 @@ def test_rotate1(self): np.testing.assert_equal(rotated_np_img.shape, np.array(rotated_pil_img).shape) + def test_perspective(self): + np_img = (np.random.rand(32, 26, 3) * 255).astype('uint8') + pil_img = Image.fromarray(np_img).convert('RGB') + tensor_img = F.to_tensor(pil_img, data_format='CHW') * 255 + + np.testing.assert_almost_equal( + np_img, tensor_img.transpose((1, 2, 0)), decimal=4) + + startpoints = [[0, 0], [13, 0], [13, 15], [0, 15]] + endpoints = [[3, 2], [12, 3], [10, 14], [2, 15]] + + np_perspectived_img = F.perspective(np_img, startpoints, endpoints) + pil_perspectived_img = F.perspective(pil_img, startpoints, endpoints) + tensor_perspectived_img = F.perspective(tensor_img, startpoints, + endpoints) + + np.testing.assert_equal(np_perspectived_img.shape, + np.array(pil_perspectived_img).shape) + np.testing.assert_equal(np_perspectived_img.shape, + tensor_perspectived_img.transpose( + (1, 2, 0)).shape) + + result_pil = np.array(pil_perspectived_img) + result_tensor = tensor_perspectived_img.numpy().transpose( + (1, 2, 0)).astype('uint8') + num_diff_pixels = (result_pil != result_tensor).sum() / 3.0 + ratio_diff_pixels = num_diff_pixels / result_tensor.shape[ + 0] / result_tensor.shape[1] + # Tolerance : less than 6% of different pixels + assert ratio_diff_pixels < 0.06 + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/transforms/__init__.py b/python/paddle/vision/transforms/__init__.py index 41e9b188e34ed..5992a4f977411 100644 --- a/python/paddle/vision/transforms/__init__.py +++ b/python/paddle/vision/transforms/__init__.py @@ -30,6 +30,7 @@ from .transforms import Pad # noqa: F401 from .transforms import RandomAffine # noqa: F401 from .transforms import RandomRotation # 
noqa: F401 +from .transforms import RandomPerspective # noqa: F401 from .transforms import Grayscale # noqa: F401 from .transforms import ToTensor # noqa: F401 from .transforms import RandomErasing # noqa: F401 @@ -40,6 +41,7 @@ from .functional import pad # noqa: F401 from .functional import affine # noqa: F401 from .functional import rotate # noqa: F401 +from .functional import perspective # noqa: F401 from .functional import to_grayscale # noqa: F401 from .functional import crop # noqa: F401 from .functional import center_crop # noqa: F401 @@ -68,6 +70,7 @@ 'Pad', 'RandomAffine', 'RandomRotation', + 'RandomPerspective', 'Grayscale', 'ToTensor', 'RandomErasing', @@ -78,6 +81,7 @@ 'pad', 'affine', 'rotate', + 'perspective', 'to_grayscale', 'crop', 'center_crop', diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index 83f756e6ed2a6..90fba1c4130e5 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -767,6 +767,95 @@ def rotate(img, return F_cv2.rotate(img, angle, interpolation, expand, center, fill) +def _get_perspective_coeffs(startpoints, endpoints): + """ + get coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + + In Perspective Transform each pixel (x, y) in the original image gets transformed as, + (x, y) -> ( (ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1) ) + + Args: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + + Returns: + output (list): octuple (a, b, c, d, e, f, g, h) for transforming each pixel. + """ + a_matrix = np.zeros((2 * len(startpoints), 8)) + + for i, (p1, p2) in enumerate(zip(endpoints, startpoints)): + a_matrix[2 * i, :] = [ + p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1] + ] + a_matrix[2 * i + 1, :] = [ + 0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1] + ] + + b_matrix = np.array(startpoints).reshape([8]) + res = np.linalg.lstsq(a_matrix, b_matrix)[0] + + output = list(res) + return output + + +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perform perspective transform of the given image. + + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be transformed. + startpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the original image. + endpoints (list of list of ints): List containing four lists of two integers corresponding to four corners + ``[top-left, top-right, bottom-right, bottom-left]`` of the transformed image. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + + Returns: + PIL.Image|np.array|paddle.Tensor: transformed Image. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.transforms import functional as F + + fake_img = paddle.randn((3, 256, 300)).astype(paddle.float32) + + startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]] + endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]] + + perspectived_img = F.perspective(fake_img, startpoints, endpoints) + print(perspectived_img.shape) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img) or + _is_tensor_image(img)): + raise TypeError( + 'img should be PIL Image or Tensor Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_pil.perspective(img, coeffs, interpolation, fill) + elif _is_tensor_image(img): + coeffs = _get_perspective_coeffs(startpoints, endpoints) + return F_t.perspective(img, coeffs, interpolation, fill) + else: + return F_cv2.perspective(img, startpoints, endpoints, interpolation, + fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index d20bf3e60d907..1b2485541c499 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py @@ -589,6 +589,56 @@ def transform(x, y, matrix): borderValue=fill) +def perspective(img, startpoints, endpoints, interpolation='nearest', fill=0): + """Perspective the image. + + Args: + img (np.array): Image to be perspectived. + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. + interpolation (int|str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to cv2.INTER_NEAREST. + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + np.array: Perspectived image. + + """ + cv2 = try_import('cv2') + _cv2_interp_from_str = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'area': cv2.INTER_AREA, + 'bicubic': cv2.INTER_CUBIC, + 'lanczos': cv2.INTER_LANCZOS4 + } + h, w = img.shape[0:2] + + startpoints = np.array(startpoints, dtype="float32") + endpoints = np.array(endpoints, dtype="float32") + matrix = cv2.getPerspectiveTransform(startpoints, endpoints) + + if len(img.shape) == 3 and img.shape[2] == 1: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill)[:, :, np.newaxis] + else: + return cv2.warpPerspective( + img, + matrix, + dsize=(w, h), + flags=_cv2_interp_from_str[interpolation], + borderValue=fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py index 4c342e31b7f89..4b86e14039ebe 100644 --- a/python/paddle/vision/transforms/functional_pil.py +++ b/python/paddle/vision/transforms/functional_pil.py @@ -479,6 +479,33 @@ def rotate(img, fillcolor=fill) +def perspective(img, coeffs, interpolation="nearest", fill=0): + """Perspective the image. 
+ + Args: + img (PIL.Image): Image to be perspectived. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST . when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + PIL.Image: Perspectived image. + + """ + + if isinstance(fill, int): + fill = tuple([fill] * 3) + + return img.transform(img.size, Image.PERSPECTIVE, coeffs, + _pil_interp_from_str[interpolation], fill) + + def to_grayscale(img, num_output_channels=1): """Converts image to grayscale version of image. diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index cafb2655659b0..df2529d1224b3 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -395,6 +395,69 @@ def rotate(img, return out.squeeze(0) +def _perspective_grid(img, coeffs, ow, oh, dtype): + theta1 = coeffs[:6].reshape([1, 2, 3]) + tmp = paddle.tile(coeffs[6:].reshape([1, 2]), repeat_times=[2, 1]) + dummy = paddle.ones((2, 1), dtype=dtype) + theta2 = paddle.concat((tmp, dummy), axis=1).unsqueeze(0) + + d = 0.5 + base_grid = paddle.ones((1, oh, ow, 3), dtype=dtype) + + x_grid = paddle.linspace(d, ow * 1.0 + d - 1.0, ow) + base_grid[..., 0] = x_grid + y_grid = paddle.linspace(d, oh * 1.0 + d - 1.0, oh).unsqueeze_(-1) + base_grid[..., 1] = y_grid + + scaled_theta1 = theta1.transpose( + (0, 2, 1)) / paddle.to_tensor([0.5 * ow, 0.5 * oh]) + output_grid1 = base_grid.reshape((1, oh * ow, 3)).bmm(scaled_theta1) + output_grid2 = base_grid.reshape( + (1, oh * ow, 3)).bmm(theta2.transpose((0, 2, 1))) + + output_grid = output_grid1 / output_grid2 - 1.0 + return output_grid.reshape((1, oh, ow, 2)) + + +def perspective(img, + coeffs, + interpolation="nearest", + fill=None, + data_format='CHW'): + """Perspective the image. + + Args: + img (paddle.Tensor): Image to be rotated. + coeffs (list[float]): coefficients (a, b, c, d, e, f, g, h) of the perspective transforms. + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set NEAREST. When use pil backend, + support method are as following: + - "nearest" + - "bilinear" + - "bicubic" + fill (3-tuple or int): RGB pixel fill value for area outside the rotated image. + If int, it is used for all channels respectively. + + Returns: + paddle.Tensor: Perspectived image. + + """ + + img = img.unsqueeze(0) + + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) + ow, oh = img.shape[-1], img.shape[-2] + dtype = img.dtype if paddle.is_floating_point(img) else paddle.float32 + + coeffs = paddle.to_tensor(coeffs, place=img.place) + grid = _perspective_grid(img, coeffs, ow=ow, oh=oh, dtype=dtype) + out = _grid_transform(img, grid, mode=interpolation, fill=fill) + + out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + + return out.squeeze(0) + + def vflip(img, data_format='CHW'): """Vertically flips the given paddle tensor. 
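The tensor, PIL, and cv2 paths above all consume the same eight homography coefficients that _get_perspective_coeffs solves for. As a quick numeric sanity check of that math, here is a minimal standalone NumPy sketch (not part of the patch; apply_coeffs is a hypothetical helper) showing that each destination corner maps back to its source corner under the fitted transform:

    import numpy as np

    def apply_coeffs(x, y, c):
        # (x, y) -> ((ax + by + c) / (gx + hy + 1), (dx + ey + f) / (gx + hy + 1))
        a, b, c0, d, e, f, g, h = c
        denom = g * x + h * y + 1.0
        return (a * x + b * y + c0) / denom, (d * x + e * y + f) / denom

    startpoints = [[0, 0], [33, 0], [33, 25], [0, 25]]
    endpoints = [[3, 2], [32, 3], [30, 24], [2, 25]]

    # Same least-squares system as _get_perspective_coeffs builds.
    a_matrix = np.zeros((8, 8))
    for i, (p1, p2) in enumerate(zip(endpoints, startpoints)):
        a_matrix[2 * i, :] = [p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]]
        a_matrix[2 * i + 1, :] = [0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]]
    b_matrix = np.array(startpoints, dtype=np.float64).reshape([8])
    coeffs = np.linalg.lstsq(a_matrix, b_matrix, rcond=None)[0]

    # With four exact correspondences the system is square, so the fitted
    # coefficients map every endpoint corner back onto its startpoint corner;
    # this inverse mapping is what the warp uses to sample source pixels.
    for (ex, ey), (sx, sy) in zip(endpoints, startpoints):
        mx, my = apply_coeffs(ex, ey, coeffs)
        assert abs(mx - sx) < 1e-6 and abs(my - sy) < 1e-6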
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 42dfd6dfa4f81..79d3b1bc92ece 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -1481,6 +1481,125 @@ def _apply_image(self, img): self.center, self.fill) +class RandomPerspective(BaseTransform): + """Random perspective transformation with a given probability. + + Args: + prob (float, optional): Probability of using transformation, ranges from + 0 to 1, default is 0.5. + distortion_scale (float, optional): Degree of distortion, ranges from + 0 to 1, default is 0.5. + interpolation (str, optional): Interpolation method. If omitted, or if + the image has only one channel, it is set to PIL.Image.NEAREST or + cv2.INTER_NEAREST. + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC + fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed + image. If given a number, the value is used for all bands respectively. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + + Shape: + - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). + - output(PIL.Image|np.ndarray|Paddle.Tensor): A perspectived image. + + Returns: + A callable object of RandomPerspective. + + Examples: + + .. code-block:: python + + import paddle + from paddle.vision.transforms import RandomPerspective + + transform = RandomPerspective(prob=1.0, distortion_scale=0.9) + + fake_img = paddle.randn((3, 200, 150)).astype(paddle.float32) + + fake_img = transform(fake_img) + print(fake_img.shape) + """ + + def __init__(self, + prob=0.5, + distortion_scale=0.5, + interpolation='nearest', + fill=0, + keys=None): + super(RandomPerspective, self).__init__(keys) + assert 0 <= prob <= 1, "probability must be between 0 and 1" + assert 0 <= distortion_scale <= 1, "distortion_scale must be between 0 and 1" + assert interpolation in ['nearest', 'bilinear', 'bicubic'] + assert isinstance(fill, (numbers.Number, str, list, tuple)) + + self.prob = prob + self.distortion_scale = distortion_scale + self.interpolation = interpolation + self.fill = fill + + def get_params(self, width, height, distortion_scale): + """ + Returns: + startpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the original image, + endpoints (list[list[int]]): [top-left, top-right, bottom-right, bottom-left] of the transformed image. 
+ """ + half_height = height // 2 + half_width = width // 2 + topleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + topright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), + ] + botright = [ + int( + random.uniform(width - int(distortion_scale * half_width) - 1, + width)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + botleft = [ + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int( + random.uniform(height - int(distortion_scale * half_height) - 1, + height)), + ] + startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], + [0, height - 1]] + endpoints = [topleft, topright, botright, botleft] + + return startpoints, endpoints + + def _apply_image(self, img): + """ + Args: + img (PIL.Image|np.array|paddle.Tensor): Image to be Perspectively transformed. + + Returns: + PIL.Image|np.array|paddle.Tensor: Perspectively transformed image. + """ + + width, height = _get_image_size(img) + + if random.random() < self.prob: + startpoints, endpoints = self.get_params(width, height, + self.distortion_scale) + return F.perspective(img, startpoints, endpoints, + self.interpolation, self.fill) + return img + + class Grayscale(BaseTransform): """Converts image to grayscale. From 5131b11f73491de72826828dc68140e1f2919dbf Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Wed, 11 May 2022 15:53:49 +0800 Subject: [PATCH 12/49] [Eager]Fix EagerTensor _copy_to memory overlap problem (#42668) --- paddle/fluid/pybind/eager_method.cc | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index d3393b7cb57ac..0661da775df84 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -361,12 +361,33 @@ static PyObject* tensor_method__is_dense_tensor_hold_allocation( EAGER_CATCH_AND_THROW_RETURN_NULL } +static void IncreaseTensorReferenceCountUntilCopyComplete( + const paddle::experimental::Tensor& tensor, const platform::Place& place) { + auto place_ = platform::is_gpu_place(place) ? place : tensor.place(); + + auto tracer = egr::Controller::Instance().GetCurrentTracer(); + auto gc = tracer->MutableGarbageCollectorIfNotExists(place_); + + // Note(dev): This is an empty callback, the only way is to "reference" + // inner memory Holder, so it will not be destructed until the kernels + // launched at current stream of given place is finished, such as + // CUDAPinned Mem -> CUDA by cudamemcpyAsync. 
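+  // A concrete way to read the note above (illustrative comment, not part of
+  // the original patch): the lambda below captures `tensor` by value, which
+  // adds one more reference to its underlying memory Holder; the garbage
+  // collector releases the callback, and with it that reference, only after
+  // the work already queued on the stream of `place_` has finished, so the
+  // source buffer cannot be freed while the asynchronous copy is in flight.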
+ auto callback = [tensor, place_]() { + VLOG(3) << "Run callback of Tensor:" << tensor.name() << " at place " + << place_; + }; + gc->DirectClearCallback(callback); +} + static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, PyObject* kwargs) { EAGER_TRY auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 0), 0); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = self->tensor.copy_to(place, blocking); + if (!blocking) { + IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place); + } egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( From 3ac9e754df3f5b713aca2895405321b5139719b8 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 11 May 2022 16:01:28 +0800 Subject: [PATCH 13/49] change Readme (#42661) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 21e0aba8b48bf..048a273a7d78b 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ PaddlePaddle is originated from industrial practices with dedication and commitm ## Installation -### Latest PaddlePaddle Release: [v2.2](https://github.com/PaddlePaddle/Paddle/tree/release/2.2) +### Latest PaddlePaddle Release: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. From c16345cbf44c86b519144dba9471350188141bb6 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 11 May 2022 16:21:17 +0800 Subject: [PATCH 14/49] [Yaml]add Double grad (#42638) * add conv2d_transpose_double_grad yaml * add test_conv_transpose double_grad test case --- .../unittests/test_conv_transpose_nn_grad.py | 44 +++++++++++++++++++ python/paddle/utils/code_gen/backward.yaml | 11 +++++ 2 files changed, 55 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py index a4ef15b1f0db3..b9e9224b9e402 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_transpose_nn_grad.py @@ -27,6 +27,9 @@ class TestConvTransposeDoubleGradCheck(unittest.TestCase): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose(x[0], x[1], groups=1) + @prog_scope() def func(self, place): shape = [2, 4, 3, 3] @@ -55,6 +58,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) def test_grad(self): places = [] @@ -67,6 +75,10 @@ def test_grad(self): class TestConvTranspose2DoubleGradCheck_AsyPadding( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 0, 0, 1]) + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -100,10 +112,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingSAME( 
TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="SAME") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -137,10 +158,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_PaddingVALID( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding="VALID") + @prog_scope() def func(self, place): shape = [2, 2, 3, 3] @@ -174,10 +204,19 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) class TestConvTranspose2DoubleGradCheck_ChannelLast( TestConvTransposeDoubleGradCheck): + def conv_transpose_wrapper(self, x): + return paddle.nn.functional.conv2d_transpose( + x[0], x[1], groups=1, padding=[1, 1], data_format="NHWC") + @prog_scope() def func(self, place): shape = [2, 3, 3, 2] @@ -213,6 +252,11 @@ def func(self, place): else: gradient_checker.double_grad_check( [x] + w, y, x_init=[x_arr] + w_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.conv_transpose_wrapper, [x] + w, + y, + x_init=[x_arr] + w_arr, + place=place) if __name__ == "__main__": diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 3de9e323c2ed9..ff49fd426146b 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -301,6 +301,16 @@ use_gpudnn : true optional : grad_input_grad, grad_filter_grad +- backward_api : conv2d_transpose_double_grad + forward : conv2d_transpose_grad(Tensor x, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_x), Tensor(grad_filter) + args : (Tensor x, Tensor filter, Tensor grad_out, Tensor grad_x_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) + output : Tensor(x_grad), Tensor(filter_grad), Tensor(grad_out_grad) + infer_meta : + func : Conv2dTransposeDoubleGradInferMeta + kernel : + func : conv2d_transpose_grad_grad + use_gpudnn : true + - backward_api : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) @@ -310,6 +320,7 @@ kernel : func : conv2d_transpose_grad use_gpudnn : true + backward : conv2d_transpose_double_grad - backward_api : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) From 
c06529722ddfc2948d3db0689c6d00425251788c Mon Sep 17 00:00:00 2001 From: Zuza Gawrysiak Date: Wed, 11 May 2022 12:20:13 +0200 Subject: [PATCH 15/49] Move weights and biases scale computing into pass (#42241) * Add int8 scales gathering pass for convolution * Fix typo * Add unittest * Add corrected unit test * Change test name * Remove enabling mkldnn in test * Speed up test * Change max examples * Add functional test * Change test name * Add new test case * Rename pass --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../framework/ir/graph_pattern_detector.h | 2 +- .../int8_scale_calculation_mkldnn_pass.cc | 179 ++++++++++++++++++ .../int8_scale_calculation_mkldnn_pass.h | 42 ++++ ...t8_scale_calculation_mkldnn_pass_tester.cc | 149 +++++++++++++++ .../fluid/inference/api/mkldnn_quantizer.cc | 1 + .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 28 ++- .../quantization/quant2_int8_mkldnn_pass.py | 1 + ...test_mkldnn_int8_scale_calculation_pass.py | 146 ++++++++++++++ tools/static_mode_white_list.py | 1 + 10 files changed, 545 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc create mode 100644 paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h create mode 100644 paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 283e79b81e7c6..d000dc7085365 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -123,6 +123,7 @@ if(WITH_MKLDNN) pass_library(conv_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_concat_relu_mkldnn_fuse_pass inference DIR mkldnn) pass_library(conv_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(int8_scale_calculation_mkldnn_pass inference DIR mkldnn) pass_library(fc_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(scale_matmul_fuse_pass inference DIR mkldnn) pass_library(cpu_bfloat16_placement_pass inference DIR mkldnn) @@ -209,6 +210,7 @@ if (WITH_MKLDNN) cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass) cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass pass_test_util) + cc_test(test_int8_scale_calculation_mkldnn_pass SRCS mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc DEPS int8_scale_calculation_mkldnn_pass pass_test_util) cc_test(test_fc_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/fc_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS fc_elementwise_add_mkldnn_fuse_pass pass_test_util) cc_test(test_fc_act_mkldnn_fuse_pass SRCS mkldnn/fc_act_mkldnn_fuse_pass_tester.cc DEPS fc_act_mkldnn_fuse_pass pass_test_util) cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass pass_test_util) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 96a1e5c0719dc..c9fea057d444d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ 
-1057,7 +1057,7 @@ struct Pool : public PatternBase {
 
 // Elementwise ops
 // Forward pass for element-wise operators (add, mul)
-// elementwise_mul_out is the result of the operator
+// elementwise_out is the result of the operator
 struct Elementwise : public PatternBase {
   Elementwise(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "elementwise") {}
diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc
new file mode 100644
index 0000000000000..678a8fb4a6955
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.cc
@@ -0,0 +1,179 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h"
+
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/platform/mkldnn_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+Int8ScaleCalculationMkldnnPass::Int8ScaleCalculationMkldnnPass() {
+  AddOpCompat(OpCompat("conv2d"))
+      .AddInput("Input")
+      .IsTensor()
+      .End()
+      .AddInput("Filter")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddInput("ResidualData")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Output")
+      .IsTensor()
+      .End()
+      .AddAttr("strides")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("paddings")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("padding_algorithm")
+      .IsOptional()
+      .IsStringIn({"EXPLICIT", "SAME", "VALID"})
+      .End()
+      .AddAttr("groups")
+      .IsNumGE(1)
+      .End()
+      .AddAttr("dilations")
+      .IsType<std::vector<int>>()
+      .End()
+      .AddAttr("data_format")
+      .IsStringIn({"NCHW", "AnyLayout"})
+      .End();
+}
+
+void Int8ScaleCalculationMkldnnPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(graph,
+                          platform::errors::InvalidArgument(
+                              "Pointer to graph argument should not be NULL."));
+  FusePassBase::Init("int8_scale_calculation_mkldnn_pass", graph);
+  GraphPatternDetector gpd;
+  patterns::Conv conv_pattern(gpd.mutable_pattern(),
+                              "int8_scale_calculation_mkldnn_pass");
+  conv_pattern();
+
+  int found_int8_scales_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    if (!IsCompat(subgraph, g)) {
+      LOG(WARNING) << "Pass in op compat failed.";
+      return;
+    }
+    GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern);
+
+    if (!platform::HasOpINT8DataType(conv_op->Op()) ||
+        conv_op->Op()->HasAttr("Sum_scale")) {
+      return;
+    }
+
+    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern);
+
+    auto input_names = conv_op->Op()->InputNames();
+    bool has_bias = std::find(input_names.begin(), input_names.end(), "Bias") !=
+                    input_names.end();
+    std::vector<int64_t> weights_tz = conv_filter->Var()->GetShape();
+    const int groups =
+        std::max(conv_op->Op()->GetAttrIfExists<int>("groups"), 1);
+
+    const auto& scale_weights_data =
+        conv_op->Op()->GetAttrIfExists<std::vector<float>>("Scale_weights");
+    const auto& scale_in_data =
+        conv_op->Op()->GetAttrIfExists<float>("Scale_in");
+
+    bool is_multi_channel = scale_weights_data.size() > 1;
+
+    int count = 1;
+    if (is_multi_channel) {
+      count *= weights_tz[0];
+      if (groups > 1) {
+        count *= weights_tz[1];
+      }
+    }
+
+    if (has_bias && conv_op->Op()->Input("Bias").size() > 0) {
+      auto bias_scales = std::vector<float>(count);
+      for (int i = 0; i < count; i++) {
+        bias_scales[i] = scale_in_data * scale_weights_data[i];
+      }
+      conv_op->Op()->SetAttr("Bias_scales", bias_scales);
+    }
+
+    const bool& force_fp32_output =
+        conv_op->Op()->GetAttrIfExists<bool>("force_fp32_output");
+    const bool& fuse_residual_conn =
+        conv_op->Op()->GetAttrIfExists<bool>("fuse_residual_connection");
+    const auto& scale_in_eltwise_data =
+        conv_op->Op()->GetAttrIfExists<float>("Scale_in_eltwise");
+    bool has_activation =
+        !conv_op->Op()->GetAttrIfExists<std::string>("fuse_activation").empty();
+    float activation_scale =
+        force_fp32_output
+            ? 1.0f
+            : has_activation
+                  ? conv_op->Op()->GetAttrIfExists<float>("Scale_out")
+                  : 1.0f;
+    auto scale_out_data =
+        force_fp32_output
+            ? 1.0f
+            : has_activation
+                  ? 1.0f
+                  : conv_op->Op()->GetAttrIfExists<float>("Scale_out");
+    float sum_scale =
+        fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
+
+    std::vector<float> output_shift_scale(count);
+
+#pragma omp parallel for if (count > 50)
+    for (int i = 0; i < count; i++) {
+      if (scale_weights_data[i] == 0.0)
+        // weights data will contain 0 in some models, then weights
+        // scale couldn't be calculated
+        output_shift_scale[i] = scale_out_data;
+      else
+        output_shift_scale[i] =
+            static_cast<float>(static_cast<double>(scale_out_data) /
+                               (static_cast<double>(scale_in_data) *
+                                static_cast<double>(scale_weights_data[i])));
+    }
+
+    conv_op->Op()->SetAttr("Sum_scale", sum_scale);
+    conv_op->Op()->SetAttr("Output_shift_scale", output_shift_scale);
+    conv_op->Op()->SetAttr("Activation_scale", activation_scale);
+    found_int8_scales_count++;
+  };
+  gpd(graph, handler);
+  AddStatis(found_int8_scales_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(int8_scale_calculation_mkldnn_pass,
+              paddle::framework::ir::Int8ScaleCalculationMkldnnPass);
+REGISTER_PASS_CAPABILITY(int8_scale_calculation_mkldnn_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().LE(
+            "conv2d", 1));
diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h
new file mode 100644
index 0000000000000..9233650a2db3c
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+// #include
+// #include
+// #include
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class Graph;
+/*
+ * compute quantization scales for biases and weights
+ */
+class Int8ScaleCalculationMkldnnPass : public FusePassBase {
+ public:
+  Int8ScaleCalculationMkldnnPass();
+  virtual ~Int8ScaleCalculationMkldnnPass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
new file mode 100644
index 0000000000000..804d04e35f690
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass_tester.cc
@@ -0,0 +1,149 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/mkldnn/int8_scale_calculation_mkldnn_pass.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
+           const std::vector<std::string>& inputs,
+           const std::vector<std::string>& outputs,
+           std::vector<float> scale_weights = {1.5f}) {
+  auto* op = prog->MutableBlock(0)->AppendOp();
+
+  op->SetType(type);
+  if (type == "conv2d") {
+    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("name", name);
+    op->SetAttr("strides", std::vector<int>({1, 1}));
+    op->SetAttr("groups", 1);
+    op->SetAttr("paddings", std::vector<int>({0, 0}));
+    op->SetAttr("padding_algorithm", std::string("EXPLICIT"));
+    op->SetAttr("dilations", std::vector<int>({1, 1}));
+    op->SetAttr("data_format", std::string("NCHW"));
+    op->SetInput("Input", {inputs[0]});
+    op->SetInput("Filter", {inputs[1]});
+    if (inputs.size() > 2)
+      op->SetInput("Bias", {inputs[2]});
+    else
+      op->SetInput("Bias", {});
+
+    op->SetOutput("Output", outputs);
+    op->SetAttr("Scale_in", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
+    op->SetAttr("Scale_weights", scale_weights);
+    op->SetAttr("use_mkldnn", true);
+    op->SetAttr("mkldnn_data_type", std::string("int8"));
+  } else {
+    FAIL() << "Unexpected operator type.";
+  }
+}
+
+ProgramDesc BuildProgramDesc(bool convWithExistingBias,
+                             std::vector<float> scale_weights = {1.5}) {
+  ProgramDesc prog;
+  std::vector<std::string> nodes{"c", "weights", "f"};
+  if (convWithExistingBias) nodes.push_back("conv_bias");
+  for (auto& v : nodes) {
+    auto* var = prog.MutableBlock(0)->Var(v);
+    var->SetType(proto::VarType::LOD_TENSOR);
+    if (v == "weights") {
+      var->SetPersistable(true);
+      var->SetShape({1, static_cast<int>(scale_weights.size()), 1, 1});
+    }
+  }
+
+  if (convWithExistingBias) {
+    SetOp(&prog, "conv2d", "conv",
+          std::vector<std::string>({"c", "weights", "conv_bias"}),
+          std::vector<std::string>({"f"}), scale_weights);
+  } else if (scale_weights.size() > 1) {
+    SetOp(&prog, "conv2d", "conv",
+          std::vector<std::string>({"c", "weights", "conv_bias"}),
+          std::vector<std::string>({"f"}), scale_weights);
+  } else {
+    SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
+          std::vector<std::string>({"f"}));
+  }
+
+  return prog;
+}
+
+void MainTest(bool convWithExistingBias, int removed_nodes_count, float scale,
+              std::vector<float> scale_weights = {1.5f}) {
+  auto prog = BuildProgramDesc(convWithExistingBias, scale_weights);
+  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
+  auto pass =
+      PassRegistry::Instance().Get("int8_scale_calculation_mkldnn_pass");
+  int original_nodes_num = graph->Nodes().size();
+  graph.reset(pass->Apply(graph.release()));
+  int current_nodes_num = graph->Nodes().size();
+
+  EXPECT_EQ(original_nodes_num, current_nodes_num);
+
+  for (auto* node : graph->Nodes()) {
+    if (node->IsOp() && node->Op()->Type() == "conv2d") {
+      auto* op = node->Op();
+      ASSERT_TRUE(op->HasAttr("use_mkldnn"));
+
+      EXPECT_EQ(op->GetAttrIfExists<std::vector<float>>("Scale_weights"),
+                scale_weights);
+      EXPECT_EQ(op->GetAttrIfExists<float>("Scale_in"), scale);
+      EXPECT_EQ(op->GetAttrIfExists<float>("Scale_out"), scale);
+
+      EXPECT_EQ(op->GetAttrIfExists<float>("Sum_scale"), scale);
+      EXPECT_EQ(
+          op->GetAttrIfExists<std::vector<float>>("Output_shift_scale")[0],
+          scale / scale_weights[0]);
+      EXPECT_EQ(op->GetAttrIfExists<float>("Activation_scale"), scale);
+
+      if (convWithExistingBias) {
+        EXPECT_EQ(op->GetAttrIfExists<std::vector<float>>("Bias_scales")[0],
+                  scale * scale_weights[0]);
+      }
+    }
+  }
+  EXPECT_EQ(original_nodes_num - removed_nodes_count, current_nodes_num);
+}
+
+TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_no_bias) {
+  auto scale = 1.0f;
+  int removed_nodes_count = 0;
+  auto scale_weights = {1.5f};
+  MainTest(false, removed_nodes_count, scale, scale_weights);
+}
+
+TEST(Int8ScaleCalculationMkldnnPass, int8_scale_calculation_with_bias) {
+  auto scale = 1.0f;
+  int removed_nodes_count = 0;
+  auto scale_weights = {1.5f};
+  MainTest(true, removed_nodes_count, scale, scale_weights);
+}
+
+TEST(Int8ScaleCalculationMkldnnPass,
+     int8_scale_calculation_with_bias_scale_weights) {
+  auto scale = 1.0f;
+  int removed_nodes_count = 0;
+  std::vector<float> scale_weights = {1.5f, 2.3f};
+  MainTest(true, removed_nodes_count, scale, scale_weights);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+USE_PASS(int8_scale_calculation_mkldnn_pass);
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 3a3e6a0908ea1..4dc80a1d75390 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -571,6 +571,7 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
   auto* builder = predictor_.config_.pass_builder();
   builder->SetPasses({
       "cpu_quantize_pass", "cpu_quantize_squash_pass",
+      "int8_scale_calculation_mkldnn_pass",
   });
   if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
   auto passes = builder->AllPasses();
diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
index 67d1aaa4baf52..fba17d303f282 100644
--- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
@@ -223,9 +223,17 @@ class ConvMKLDNNHandlerT
     float sum_scale = 1.0f;
     float activation_scale = 1.0f;
     std::vector<float> output_shift_scale;
-    if (platform::is_int8<T>())
-      std::tie(sum_scale, output_shift_scale, activation_scale) =
-          get_int8_scales(ctx);
+    if (platform::is_int8<T>()) {
+      if (ctx.HasAttr("Sum_scale")) {
+        sum_scale = ctx.Attr<float>("Sum_scale");
+        activation_scale = ctx.Attr<float>("Activation_scale");
+        output_shift_scale =
+            ctx.Attr<std::vector<float>>("Output_shift_scale");
+      } else {
+        std::tie(sum_scale, output_shift_scale, activation_scale) =
+            get_int8_scales(ctx);
+      }
+    }
 
     const dnnl::primitive_attr conv_attr = CreatePostOps(
         fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn,
@@ -872,8 +880,18 @@ class ConvMKLDNNOpKernel : public framework::OpKernel<T> {
                 {DNNL_ARG_DST, *dst_memory_p}};
 
     if (bias) {
-      auto p_scales_tuple = handler.get_int8_bias_scales(ctx);
-
+      std::vector<float> bias_scales;
+      auto p_scales_tuple =
+          std::make_shared<std::tuple<float, std::vector<float>>>(
+              std::make_tuple(static_cast<float>(mask_reorder), bias_scales));
+      if (ctx.HasAttr("Bias_scales")) {
+        bias_scales = ctx.Attr<std::vector<float>>("Bias_scales");
+        p_scales_tuple =
+            std::make_shared<std::tuple<float, std::vector<float>>>(
+                std::make_tuple(static_cast<float>(mask_reorder), bias_scales));
+      } else {
+        p_scales_tuple = handler.get_int8_bias_scales(ctx);
+      }
       auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
           bias, true, std::get<1>(*p_scales_tuple),
           std::get<0>(*p_scales_tuple));
diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
index e8a9300635e2c..e543bc1e17b2c 100644
--- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
+++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py
@@ -668,4 +668,5 @@ def _quantize_fp32_graph(self, graph):
             graph, 'cpu_quantize_pass', ['quant_var_scales', 'data_layout'],
             [self._var_quant_scales, self._get_data_layout(graph)])
         graph = self._apply_pass(graph, 'cpu_quantize_squash_pass')
+        graph = self._apply_pass(graph, 'int8_scale_calculation_mkldnn_pass')
         return graph
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py
new file mode 100644
index 0000000000000..31415f6472587
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_int8_scale_calculation_pass.py
@@ -0,0 +1,146 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
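+
+# A minimal, self-contained sketch of the scale arithmetic that
+# int8_scale_calculation_mkldnn_pass precomputes (see the C++ pass above).
+# Illustrative only: the names and values below are made up and this block
+# is not part of the original patch.
+scale_in, scale_out = 0.5, 2.0
+scale_weights = [1.5, 0.0, 4.0]  # per-output-channel weight scales
+bias_scales = [scale_in * sw for sw in scale_weights]
+output_shift_scale = [
+    # fall back to scale_out for a zero weight scale, as the pass does,
+    # to avoid dividing by zero
+    scale_out if sw == 0.0 else scale_out / (scale_in * sw)
+    for sw in scale_weights
+]
+assert bias_scales == [0.75, 0.0, 2.0]
+assert output_shift_scale == [2.0 / 0.75, 2.0, 1.0]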
+ +from auto_scan_test import PassAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import unittest + +import hypothesis.strategies as st + + +class TestInt8ScaleCalculationMkldnnPass(PassAutoScanTest): + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_gpu=False) + config.pass_builder().append_pass("int8_scale_calculation_mkldnn_pass") + yield config, ["conv2d"], (1e-4, 1e-5) + + def is_program_valid(self, prog_config): + paddings = prog_config.ops[0].attrs["paddings"] + strides = prog_config.ops[0].attrs["strides"] + groups = prog_config.ops[0].attrs["groups"] + padding_algorithm = prog_config.ops[0].attrs["padding_algorithm"] + dilations = prog_config.ops[0].attrs["dilations"] + data_format = prog_config.ops[0].attrs["data_format"] + filter_shape = prog_config.weights["filter"].shape + input_shape = prog_config.inputs["input_x"].shape + if padding_algorithm == "VALID": + if ((input_shape[2] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if padding_algorithm == "EXPLICIT": + if ((input_shape[2] + paddings[0] + paddings[1] - (dilations[0] * (filter_shape[2] - 1) + 1)) / strides[0] + 1) <= 1 or \ + ((input_shape[3] + paddings[2] + paddings[3] - (dilations[1] * (filter_shape[3] - 1) + 1)) / strides[1] + 1) <= 1: + return False + if data_format == "NCHW": + if input_shape[1] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + else: + if input_shape[3] != filter_shape[1] * groups: + return False + if filter_shape[0] % groups != 0: + return False + return True + + def sample_program_config(self, draw): + x_shape = draw( + st.lists( + st.integers( + min_value=5, max_value=100), min_size=4, max_size=4)) + x_shape[1] = draw(st.integers(min_value=5, max_value=10)) + + data_format = draw(st.sampled_from(["NCHW", "NHWC"])) + + f_shape = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + if data_format == "NCHW": + f_shape[1] = x_shape[1] + else: + f_shape[1] = x_shape[3] + + strides = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) + + padding = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=4, max_size=4)) + + groups = draw(st.integers(min_value=1, max_value=3)) + + dilations = draw( + st.lists( + st.integers( + min_value=1, max_value=4), min_size=2, max_size=2)) + + bias_shape = [f_shape[0]] + inputs = dict() + weights = dict() + use_mkldnn = True + + has_bias = draw(st.booleans()) + if has_bias: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = { + "filter": TensorConfig(shape=f_shape), + "bias": TensorConfig(shape=bias_shape), + } + else: + inputs = { + "Input": ["input_x"], + "Filter": ["filter"], + } + weights = {"filter": TensorConfig(shape=f_shape), } + + conv2d_op = OpConfig( + "conv2d", + inputs=inputs, + outputs={"Output": ["conv2d_out"]}, + strides=strides, + padding_algorithm=padding_algorithm, + paddings=padding, + groups=groups, + dilations=dilations, + data_format=data_format, + use_mkldnn=use_mkldnn, + mkldnn_data_type="int8") + + ops = [conv2d_op] + + program_config = ProgramConfig( + ops=ops, + weights=weights, + inputs={"input_x": TensorConfig(shape=x_shape)}, + outputs=["conv2d_out"]) + return program_config + + def test(self): 
+ self.run_and_statis( + quant=False, + max_examples=100, + passes=["int8_scale_calculation_mkldnn_pass"]) + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 5070ea2ef06a3..6067b40f0a7c1 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -655,6 +655,7 @@ 'test_transpose_mkldnn_op', 'test_mkldnn_conv_activation_fuse_pass', 'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass', + 'test_mkldnn_int8_scale_calculation_pass', 'test_mkldnn_matmul_op_output_fuse_pass', 'test_mkldnn_matmul_transpose_reshape_fuse_pass', 'test_mkldnn_scale_matmul_fuse_pass', From ba71fbea5c73040639f5c8074c426e2603067a65 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Wed, 11 May 2022 20:27:53 +0800 Subject: [PATCH 16/49] [Phi] Change the output format of C++ backward api (Part1) (#42677) * change the output format of C++ backward api * fix merge conflict * fix sparse api code auto-gen * fix eager_gen bug * fix bug of output is null * fix bug of conv2d_grad_impl * fix optional grad * fix bug of eager-gen double_grad * fix bug * fix multiply_double_grad bug * remove node pruning --- .../final_state_generator/CMakeLists.txt | 4 +- .../final_state_generator/eager_gen.py | 169 +++++++++--------- paddle/phi/api/lib/CMakeLists.txt | 1 + paddle/phi/api/lib/api_custom_impl.cc | 65 +++---- paddle/phi/api/lib/api_custom_impl.h | 42 ++--- paddle/phi/api/lib/api_gen_utils.cc | 21 ++- paddle/phi/api/lib/api_gen_utils.h | 3 + paddle/phi/infermeta/multiary.cc | 4 +- .../impl/elementwise_grad_kernel_impl.h | 14 ++ paddle/phi/tests/api/test_matmul_api.cc | 14 +- paddle/phi/tests/api/test_sparse_conv_api.cc | 6 +- python/paddle/utils/code_gen/api_base.py | 42 +++-- python/paddle/utils/code_gen/api_gen.py | 10 +- python/paddle/utils/code_gen/backward.yaml | 10 +- .../paddle/utils/code_gen/backward_api_gen.py | 62 +++++-- .../paddle/utils/code_gen/sparse_api_gen.py | 16 +- .../utils/code_gen/sparse_bw_api_gen.py | 41 ++--- .../paddle/utils/code_gen/strings_api_gen.py | 2 +- 18 files changed, 296 insertions(+), 230 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt index 8e89ea3f19762..94f7f717fb24a 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/CMakeLists.txt @@ -16,9 +16,9 @@ add_custom_target(eager_final_state_codegen COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py" "--api_yaml_path=${api_yaml_path}" "--backward_yaml_path=${backward_yaml_path}" - "--forwards_cc_path=${tmp_forwards_cc_path}" + "--forwards_cc_path=${tmp_forwards_cc_path}" "--forwards_h_path=${tmp_forwards_h_path}" - "--nodes_cc_path=${tmp_nodes_cc_path}" + "--nodes_cc_path=${tmp_nodes_cc_path}" "--nodes_h_path=${tmp_nodes_h_path}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_cc_path} ${forwards_cc_path} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_forwards_h_path} ${forwards_h_path} diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 9d95b9488d298..092c4b6e605db 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ 
b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -146,10 +146,7 @@ class {} : public egr::GradNodeBase {{ {} // Call grad_api function - VLOG(3) << \"Final State Running: \" << \"{}\"; -{} - - // Get Output + VLOG(3) << \"Final State Running: {}\"; {} // Get GradIn autograd_meta {} @@ -641,7 +638,7 @@ def GenerateNodeCreationCodes(self): pass_stop_gradient_args_list.append(output_autograd_meta_name) pass_stop_gradient_args_str = ",".join(pass_stop_gradient_args_list) - # Node Construction + # Node Construction num_backward_inputs = len(forward_outputs_position_map.keys()) num_backward_outputs = len(forward_inputs_position_map.keys()) grad_node_name = GetGradNodeName(forward_api_name) @@ -701,6 +698,7 @@ def GenerateNodeCreationCodes(self): set_output_tensor_wrappers_list) # SetGradOutMeta & SetEdges + grad_node_out_list = [] set_grad_out_meta_list = [] set_edges_list = [] for name, (_, pos) in forward_inputs_position_map.items(): @@ -713,7 +711,7 @@ def GenerateNodeCreationCodes(self): if not has_corresponding_grad_output: continue - input_autograd_meta_name = GetAutoGradMetaName(name) + grad_node_out_list.append(name) is_optional = (name in self.optional_inputs) if is_optional: set_grad_out_meta = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetGradOutMeta(*({name}.get_ptr()), {pos});" @@ -755,6 +753,7 @@ def GenerateNodeCreationCodes(self): set_input_tensor_wrappers_str, set_grad_out_meta_str, set_out_rank_str, set_history_str, set_grad_in_meta_str, set_retain_grad_str, set_output_tensor_wrappers_str) + self.grad_node_out_list = grad_node_out_list def run(self): # Basic Validation Check @@ -1140,6 +1139,7 @@ def GenerateHigherOrderNodeCreationCode(self): next_grad_api_contents = self.next_grad_api_contents grad_node_creation_str = "" + grad_node_out_list = [] if next_grad_api_contents: forward_api_contents = grad_api_contents forward_api_contents['api'] = forward_api_contents['backward_api'] @@ -1150,10 +1150,11 @@ def GenerateHigherOrderNodeCreationCode(self): next_node_generator.run() next_node_generator.GenerateNodeCreationCodes() grad_node_creation_str = next_node_generator.node_creation_str + grad_node_out_list = next_node_generator.grad_node_out_list self.RecordGrad2NextGradNameMapping(next_node_generator) - return grad_node_creation_str + return grad_node_creation_str, grad_node_out_list def GenerateNodeDeclaration(self): forward_op_name = self.forward_api_name @@ -1214,7 +1215,8 @@ def GenerateNodeDeclaration(self): set_attribute_methods_str, tensor_wrapper_members_str, attribute_members_str) - def GenerateNodeDefinition(self, grad_node_creation_str): + def GenerateNodeDefinition(self, grad_node_creation_str, + grad_node_out_list): namespace = self.namespace forward_api_name = self.forward_api_name backward_api_name = self.backward_api_name @@ -1290,28 +1292,41 @@ def GenerateNodeDefinition(self, grad_node_creation_str): get_grad_in_args_list.append(get_attr_str) get_grad_in_args_str = "\n".join(get_grad_in_args_list) - grad_api_args_str = ", ".join(grad_api_args) - - # Grad Function Call String - grad_api_namespace = f"paddle::experimental::{namespace}" - grad_function_call_str = f"{indent}auto grad_api_result = {grad_api_namespace}{backward_api_name}({grad_api_args_str});" - # Get Grad Outputs - get_outputs_str = "" - num_outputs = len(backward_grad_outputs_map.keys()) + # Grad Outputs for name, (ttype, fwd_position, grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - if 
num_outputs == 1: - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result;" + if IsPlainTensorType(ttype): + grad_api_args.append(f"api_output[{fwd_position}][0]") else: - if IsPlainTensorType(ttype): - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}][0];" - else: - assert IsVectorTensorType(ttype) - get_tensor_str = f"{indent}auto& {transformed_tensor_name} = grad_api_result[{grad_api_position}];" - get_outputs_str += get_tensor_str + "\n" + assert IsVectorTensorType(ttype) + grad_api_args.append(f"api_output[{fwd_position}]") + + grad_api_args_str = ", ".join(grad_api_args) + + # Grad Function Call String + slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) + grad_api_namespace = f"paddle::experimental::{namespace}" + grad_function_call_str = f""" + const auto& out_metas = OutputMeta(); + paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs}); + paddle::small_vector, egr::kSlotSmallVectorSize> api_output({slot_num_bwd_outputs}); + for (int i = 0; i < {slot_num_bwd_outputs}; ++i) {{ + returns[i].resize(out_metas[i].size()); + if(returns[i].size() == 0) {{ + api_output[i].reserve(1); + api_output[i].push_back(nullptr); + continue; + }} + api_output[i].reserve(returns[i].size()); + for (size_t j = 0; j < returns[i].size(); ++j) {{ + api_output[i].push_back(&returns[i][j]); + }} + }} +""" + + grad_function_call_str = grad_function_call_str + f"{indent}{grad_api_namespace}{backward_api_name}({grad_api_args_str});" # Prepare for Node Creation if Necessary inputs_autograd_meta_str = "" @@ -1324,38 +1339,41 @@ def GenerateNodeDefinition(self, grad_node_creation_str): for name, (ttype, pos, grad_api_position) in backward_grad_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) # 2. 
Get TensorWrapper AutoGradMeta for name, (ttype, _, pos), in backward_forward_inputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) - - input_autograd_meta_name = GetAutoGradMetaName( - transformed_tensor_name) - if IsPlainTensorType(ttype): - input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" - else: - assert IsVectorTensorType(ttype) - input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + if transformed_tensor_name in grad_node_out_list: + input_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) - input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" - input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + if IsPlainTensorType(ttype): + input_autograd_meta = f"{indent}egr::AutogradMeta* {input_autograd_meta_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});" + else: + assert IsVectorTensorType(ttype) + input_autograd_meta_vec_name = GetAutoGradMetaVectorName( + transformed_tensor_name) + input_autograd_meta = f"{indent}std::vector {input_autograd_meta_vec_name} = egr::EagerUtils::nullable_autograd_meta({transformed_tensor_name});\n" + input_autograd_meta += f"{indent}std::vector* {input_autograd_meta_name} = &{input_autograd_meta_vec_name};" + + inputs_autograd_meta_list.append(input_autograd_meta) + compute_require_grad_args_list.append( + input_autograd_meta_name) - inputs_autograd_meta_list.append(input_autograd_meta) - compute_require_grad_args_list.append(input_autograd_meta_name) inputs_autograd_meta_str = "\n".join(inputs_autograd_meta_list) compute_require_grad_args_str = ",".join( compute_require_grad_args_list) @@ -1363,28 +1381,26 @@ def GenerateNodeDefinition(self, grad_node_creation_str): # 3. 
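
The membership test against grad_node_out_list, applied above both to grad inputs and to tensor wrappers, means AutogradMeta bindings are only emitted for tensors the next-order grad node actually consumes. A sketch of the gating, with illustrative names (the real generator also handles the vector-of-tensors case):

def emit_autograd_metas(backward_input_names, grad_node_out_list):
    lines = []
    for name in backward_input_names:
        if name not in grad_node_out_list:
            continue  # not an output of the next grad node: skip it
        lines.append(f"egr::AutogradMeta* {name}_autograd_meta = "
                     f"egr::EagerUtils::nullable_autograd_meta({name});")
    return "\n".join(lines)

print(emit_autograd_metas(["grad_x", "grad_y"], ["grad_x"]))
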
Get Output AutoGradMeta outputs_autograd_meta_list = [] num_fwd_outputs = len(backward_grad_outputs_map.keys()) - for name, (rtype, pos, _) in backward_grad_outputs_map.items(): + for name, (rtype, pos, + grad_api_position) in backward_grad_outputs_map.items(): transformed_tensor_name = self.TransformToNextGradName(name) output_autograd_meta_name = GetAutoGradMetaName( transformed_tensor_name) output_autograd_meta_vec_name = GetAutoGradMetaVectorName( transformed_tensor_name) - if num_fwd_outputs == 1: - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + if IsPlainTensorType(rtype): + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}][0]; + egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});""" + else: - # Tuple api_result - if IsPlainTensorType(rtype): - output_autograd_meta = f"{indent}egr::AutogradMeta* {output_autograd_meta_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});" - else: - assert IsVectorTensorType(rtype) - output_autograd_meta = f"{indent}std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name});\n" - output_autograd_meta += f"{indent}std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name};" + assert IsVectorTensorType(rtype) + output_autograd_meta = f""" + auto& {transformed_tensor_name} = returns[{pos}]; + std::vector {output_autograd_meta_vec_name} = egr::EagerUtils::autograd_meta(&{transformed_tensor_name}); + std::vector* {output_autograd_meta_name} = &{output_autograd_meta_vec_name}; +""" outputs_autograd_meta_list.append(output_autograd_meta) outputs_autograd_meta_str = "\n".join(outputs_autograd_meta_list) @@ -1392,28 +1408,14 @@ def GenerateNodeDefinition(self, grad_node_creation_str): compute_require_grad_str = f"{indent}bool trace_backward = egr::Controller::Instance().HasGrad() && create_graph;\n" compute_require_grad_str += f"{indent}bool require_any_grad = egr::EagerUtils::ComputeRequireGrad({compute_require_grad_args_str});" - # Construct grad_api returns - slot_num_bwd_outputs = len(self.forward_inputs_position_map.keys()) - returns_str = f"{indent}paddle::small_vector, egr::kSlotSmallVectorSize> returns({slot_num_bwd_outputs});\n" - for name, (ttype, fwd_position, - grad_api_position) in backward_grad_outputs_map.items(): - transformed_tensor_name = self.TransformToNextGradName(name) - - # Rearrange output order accordingly - if IsPlainTensorType(ttype): - returns_str += f"{indent}returns[{fwd_position}] = {{ {transformed_tensor_name} }};\n" - else: - assert IsVectorTensorType(ttype) - returns_str += f"{indent}returns[{fwd_position}] = {transformed_tensor_name};\n" - - returns_str += f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" + returns_str = f"{indent}if(NeedComplexToRealConversion()) HandleComplexGradToRealGrad(&returns);\n" returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(forward_api_name) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( grad_node_name, fill_zero_str, get_grad_in_args_str, 
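
To keep the call chain clear: GenerateHigherOrderNodeCreationCode now returns a pair, and run() (changed below) unpacks it and forwards the list into GenerateNodeDefinition. A trimmed stub of just that plumbing, assuming nothing about the rest of the generator's state:

class NodeGeneratorStub:
    def GenerateHigherOrderNodeCreationCode(self):
        grad_node_creation_str, grad_node_out_list = "", []
        # ... both are populated from next_node_generator when a
        # higher-order backward api exists ...
        return grad_node_creation_str, grad_node_out_list

    def GenerateNodeDefinition(self, creation_str, grad_node_out_list):
        assert isinstance(grad_node_out_list, list)

    def run(self):
        creation_str, out_list = self.GenerateHigherOrderNodeCreationCode()
        self.GenerateNodeDefinition(creation_str, out_list)

NodeGeneratorStub().run()
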
grad_node_name, - grad_function_call_str, get_outputs_str, inputs_autograd_meta_str, + grad_function_call_str, inputs_autograd_meta_str, outputs_autograd_meta_str, compute_require_grad_str, grad_node_creation_str, returns_str) @@ -1426,16 +1428,17 @@ def run(self): ## Code Generation ## ##################### # Higher-order GradNode generation - grad_node_creation_str = self.GenerateHigherOrderNodeCreationCode() + grad_node_creation_str, grad_node_out_list = self.GenerateHigherOrderNodeCreationCode( + ) self.GenerateNodeDeclaration() - self.GenerateNodeDefinition(grad_node_creation_str) + self.GenerateNodeDefinition(grad_node_creation_str, grad_node_out_list) class DygraphYamlGenerator(YamlGeneratorBase): def __init__(self, api_yaml_path, backward_yaml_path): - # Parent members: + # Parent members: # self.namespace # self.api_yaml_path # self.forward_api_list diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index b195ed1aefadc..ddeb073046bf1 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -74,6 +74,7 @@ add_custom_command( COMMAND ${PYTHON_EXECUTABLE} ${api_gen_file} --api_yaml_path ${api_yaml_file} --api_header_path ${api_header_file_tmp} + --api_header_path ${api_header_file_tmp} --api_source_path ${api_source_file_tmp} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_header_file_tmp} ${api_header_file} COMMAND ${CMAKE_COMMAND} -E copy_if_different ${api_source_file_tmp} ${api_source_file} diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 38a60ab978900..d80444e7f710c 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -592,19 +592,20 @@ Tensor conv2d_impl(const Tensor& input, return api_output; } -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search) { +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad) { Backend kernel_backend = Backend::UNDEFINED; DataLayout kernel_layout = DataLayout::UNDEFINED; DataType kernel_data_type = DataType::UNDEFINED; @@ -646,18 +647,15 @@ std::vector> conv2d_grad_impl( auto input_filter = PrepareData(filter, args1, {}); auto input_out_grad = PrepareData(out_grad, args2, {}); - std::vector> api_output(2); - api_output[0].emplace_back(); - auto kernel_out_0 = SetKernelOutput(kernel_backend, &api_output[0][0]); - api_output[1].emplace_back(); - auto kernel_out_1 = SetKernelOutput(kernel_backend, &api_output[1][0]); + auto kernel_out_0 = SetKernelOutput(kernel_backend, input_grad); + auto kernel_out_1 = SetKernelOutput(kernel_backend, filter_grad); phi::MetaTensor meta_out_0(kernel_out_0); phi::MetaTensor meta_out_1(kernel_out_1); phi::GeneralBinaryGradInferMeta(MakeMetaTensor(*input_input), MakeMetaTensor(*input_filter), - &meta_out_0, - &meta_out_1); + kernel_out_0 ? &meta_out_0 : nullptr, + kernel_out_1 ? 
&meta_out_1 : nullptr); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -693,8 +691,6 @@ std::vector> conv2d_grad_impl( kernel_out_0, kernel_out_1); } - - return api_output; } Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { @@ -1080,8 +1076,9 @@ std::tuple sgd_impl( // but if we use this impl, it will not support. We need to be able to reuse // the autograd API here, which is not yet implemented // TODO(chenweihang): we should support call generated api in custom api impl -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad) { +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad) { auto kernel_key_set = ParseKernelKeyByInputArgs(out_grad); auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); @@ -1099,9 +1096,7 @@ std::vector add_n_grad_impl(const std::vector& x, auto dense_out_grad = PrepareData(out_grad, kernel.InputAt(0), {}); - size_t out_number = x.size(); - std::vector x_grad; - auto dense_x_grad = SetKernelOutput(out_number, kernel_backend, &x_grad); + auto dense_x_grad = SetKernelOutput(&x_grad); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, @@ -1117,8 +1112,6 @@ std::vector add_n_grad_impl(const std::vector& x, (*kernel_fn)( *dev_ctx, *dense_out_grad, phi::Scalar(1.0), 0.0, true, dense_x_grad_t); } - - return x_grad; } std::tuple batch_norm_impl( @@ -1250,7 +1243,7 @@ std::tuple batch_norm_impl( return api_output; } -Tensor imag_grad_impl(const Tensor& out_grad) { +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1264,8 +1257,7 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1274,11 +1266,9 @@ Tensor imag_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } -Tensor real_grad_impl(const Tensor& out_grad) { +void real_grad_impl(const Tensor& out_grad, Tensor* x_grad) { phi::KernelKey kernel_key{ParseBackend(out_grad), out_grad.layout(), phi::dtype::ToComplex(out_grad.dtype())}; @@ -1292,8 +1282,7 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto dense_out_grad = TensorToDenseTensor(out_grad); - Tensor out; - auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), x_grad); phi::MetaTensor meta_out(kernel_out); phi::RealAndImagGradInferMeta(*dense_out_grad, &meta_out); @@ -1302,8 +1291,6 @@ Tensor real_grad_impl(const Tensor& out_grad) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)(*dev_ctx, *dense_out_grad, kernel_out); - - return out; } } // namespace experimental diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index 46abcd90de32a..d88a134654caf 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -96,20 +96,6 @@ Tensor conv2d_impl(const Tensor& input, int workspace_size_MB, bool exhaustive_search); -std::vector> conv2d_grad_impl( - const Tensor& input, - const Tensor& filter, - const Tensor& out_grad, - const 
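
The pattern of the refactor above: grad impls become void and fill caller-provided out-parameters, with nullptr meaning "this gradient is not requested". A Python analogue using a one-element list for the out-parameter (the kernel math is a stand-in, not any real grad formula):

def grad_impl_like(out_grad, x_grad):
    kernel_out = x_grad if x_grad is not None else None  # nullptr guard
    if kernel_out is None:
        return                 # nothing requested, nothing computed
    kernel_out[0] = -out_grad  # stand-in for the real kernel math

slot = [None]
grad_impl_like(1.5, slot)
assert slot == [-1.5]
grad_impl_like(1.5, None)      # requesting no gradient is now legal
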
std::vector& strides, - const std::vector& paddings, - const std::string& paddding_algorithm, - int groups, - const std::vector& dilations, - const std::string& data_format, - bool use_addto, - int workspace_size_MB, - bool exhaustive_search); - Tensor copy_to_impl(const Tensor& x, Place place, bool blocking); std::vector split_impl(const Tensor& x, @@ -138,12 +124,28 @@ std::tuple sgd_impl( ////////////////// Backward(grad) api impls ////////////////////// -std::vector add_n_grad_impl(const std::vector& x, - const Tensor& out_grad); - -Tensor imag_grad_impl(const Tensor& x); - -Tensor real_grad_impl(const Tensor& x); +void add_n_grad_impl(const std::vector& x, + const Tensor& out_grad, + std::vector x_grad); + +void conv2d_grad_impl(const Tensor& input, + const Tensor& filter, + const Tensor& out_grad, + const std::vector& strides, + const std::vector& paddings, + const std::string& paddding_algorithm, + int groups, + const std::vector& dilations, + const std::string& data_format, + bool use_addto, + int workspace_size_MB, + bool exhaustive_search, + Tensor* input_grad, + Tensor* filter_grad); + +void imag_grad_impl(const Tensor& out_grad, Tensor* x_grad); + +void real_grad_impl(const Tensor& out_grad, Tensor* x_grad); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index fb205212ff371..2111829b8d60b 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -113,10 +113,13 @@ phi::MetaTensor MakeMetaTensor(const phi::StringTensor& tensor) { /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { - if (out->impl() == nullptr) { - out->set_impl(std::make_shared()); + if (out) { + if (out->impl() == nullptr) { + out->set_impl(std::make_shared()); + } + return static_cast(out->impl().get()); } - return static_cast(out->impl().get()); + return nullptr; } std::vector SetKernelOutput(size_t out_size, @@ -133,6 +136,18 @@ std::vector SetKernelOutput(size_t out_size, return results; } +std::vector SetKernelOutput(std::vector* out) { + std::vector results(out->size(), nullptr); + for (size_t i = 0; i < out->size(); ++i) { + if (out->at(i)) { + auto tensor_ptr = std::make_shared(); + results[i] = tensor_ptr.get(); + (*out)[i]->set_impl(tensor_ptr); + } + } + return results; +} + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto select_rows = std::make_shared(); diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 47b80bb3fc290..7303e6b46114d 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -74,6 +74,9 @@ std::vector SetKernelOutput(size_t out_size, Backend backend, std::vector* out); +// For backward api +std::vector SetKernelOutput(std::vector* out); + phi::SelectedRows* SetSelectedRowsKernelOutput(Backend backend, Tensor* out); phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 519d21b323fc2..e793eb8e66872 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -1998,7 +1998,9 @@ void StackInferMeta(const std::vector& x, void UnchangedMultiInferMeta(const std::vector& x, std::vector out) { for (size_t i = 0; i < x.size(); ++i) { - out[i]->share_meta(*x[i]); + if (out[i]) { + out[i]->share_meta(*x[i]); + } } } diff --git 
a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index 5d365786001a3..3c06b238d145c 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -513,6 +513,20 @@ void MultiplyDoubleGradKernel(const Context& dev_ctx, funcs::InverseMultiplyFunctor>( dev_ctx, dout, ddy_safe, dx, axis); } + } else { + if (dx && dy) { + phi::funcs::ElemwiseGradCompute, MulGradDY>( + dev_ctx, + ddx_safe, + ddy_safe, + dout, + dout, + axis, + dx, + dy, + MulGradDX(), + MulGradDY()); + } } } diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index e2c324a6775c8..0d4ec7bd4f592 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -179,8 +179,18 @@ TEST(API, matmul_double_grad) { auto dx_grad = paddle::experimental::full({3, 3}, 2.0); // 2. test API - const auto out = paddle::experimental::matmul_double_grad( - x, y, out_grad, dx_grad, {}, false, false); + std::vector> out( + 3, std::vector(1)); + paddle::experimental::matmul_double_grad(x, + y, + out_grad, + dx_grad, + {}, + false, + false, + &out[0][0], + &out[1][0], + &out[2][0]); // 3. check result ASSERT_EQ(out.size(), 3UL); diff --git a/paddle/phi/tests/api/test_sparse_conv_api.cc b/paddle/phi/tests/api/test_sparse_conv_api.cc index 7c4aa16425907..c00113389adb7 100644 --- a/paddle/phi/tests/api/test_sparse_conv_api.cc +++ b/paddle/phi/tests/api/test_sparse_conv_api.cc @@ -77,11 +77,11 @@ void TestConv3dBase(const std::vector& indices, kernel.size() * sizeof(T)); if (!std::is_same::value) { - auto outs = paddle::experimental::sparse::conv3d( + auto tensor_out = paddle::experimental::sparse::conv3d( x, weight, paddings, dilations, strides, 1, false); - auto out = std::dynamic_pointer_cast( - std::get<0>(outs).impl()); + auto out = + std::dynamic_pointer_cast(tensor_out.impl()); ASSERT_EQ(correct_out_dims.size(), out->dims().size()); for (int i = 0; i < correct_out_dims.size(); i++) { ASSERT_EQ(correct_out_dims[i], out->dims()[i]); diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 717870ee01d0a..af870fcc8e54d 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -60,6 +60,12 @@ def get_api_name(self, api_item_yaml): def get_api_func_name(self): return self.api + def get_declare_args(self): + return self.args_str['args_declare'] + + def get_define_args(self): + return self.args_str["args_define"] + def parse_args(self, api_name, api_item_yaml): optional_vars = [] if 'optional' in api_item_yaml: @@ -309,12 +315,12 @@ def get_return_type(self, out_type_list): def gene_api_declaration(self): api_declaration = f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.args_str['args_declare']}); +PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.get_declare_args()}); """ if self.is_base_api and self.inplace_map is not None: api_declaration = api_declaration + f""" -PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.args_str['args_declare']}); +PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name() + '_'}({self.get_declare_args()}); """ return api_declaration @@ -513,7 +519,7 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: {code_indent} auto {out_name}_{PREFIX_META_TENSOR_NAME}vec = MakeMetaTensor({out_name}); {code_indent} 
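
The else-branch added to MultiplyDoubleGradKernel above handles the case where ddout is not wanted but both dx and dy are: with x = ddx, y = ddy, and dout passed as both out and dout, MulGradDX/MulGradDY reduce to dx = dout * ddy and dy = dout * ddx. A numpy finite-difference check of those identities (shapes chosen arbitrarily):

import numpy as np

def first_order_grads(x, y, dout):
    # for z = x * y: dL/dx = dout * y, dL/dy = dout * x
    return dout * y, dout * x

rng = np.random.default_rng(0)
x, y, dout = (rng.standard_normal(4) for _ in range(3))
ddx, ddy = (rng.standard_normal(4) for _ in range(2))

eps = 1e-6
gx1, gy1 = first_order_grads(x + eps * ddx, y + eps * ddy, dout)
gx0, gy0 = first_order_grads(x, y, dout)
assert np.allclose((gx1 - gx0) / eps, dout * ddy, atol=1e-4)
assert np.allclose((gy1 - gy0) / eps, dout * ddx, atol=1e-4)
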
std::vector {out_name}_metas({out_name}_{PREFIX_META_TENSOR_NAME}vec.size()); {code_indent} for (size_t i = 0; i < {out_name}_{PREFIX_META_TENSOR_NAME}vec.size(); ++i) {{ -{code_indent} {out_name}_metas[i] = &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i]; +{code_indent} {out_name}_metas[i] = {out_name}[i] ? &{out_name}_{PREFIX_META_TENSOR_NAME}vec[i] : nullptr; {code_indent} }}""" param_code = param_code + out_name + '_metas, ' @@ -521,8 +527,10 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + out_name.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + out_name + ");\n" - param_code = param_code + "&" + out_name.replace( - 'kernel_', PREFIX_META_TENSOR_NAME) + ", " + if len(kernel_output_names) == 1: + param_code = param_code + f"&{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)}, " + else: + param_code = param_code + f"{out_name} ? &{out_name.replace('kernel_', PREFIX_META_TENSOR_NAME)} : nullptr, " param_code = param_code[:-2] return f"""{meta_tensor_code} @@ -712,7 +720,7 @@ def gene_return_type_code(self): # Override by child class def gene_return_code(self): - return "api_output" + return "return api_output;" # Override by child class def gene_output(self, @@ -748,7 +756,7 @@ def gen_dense_tensor_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): input_tensors, kernel_args, kernel_signature = self.get_selected_rows_kernel_args( @@ -775,12 +783,12 @@ def gen_selected_rows_kernel_code(self, code_indent, inplace_flag=False): {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); {code_indent} }} -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_base_api_code(self, inplace_flag=False): api_func_name = self.get_api_func_name() + ('_' if inplace_flag else '') api_code = f""" -PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.gene_return_type_code()} {api_func_name}({self.get_define_args()}) {{ {self.gene_kernel_select()} """ @@ -802,6 +810,12 @@ def gene_base_api_code(self, inplace_flag=False): } """ + def gene_invoke_code(self, invoke_code, params_code): + return f""" +PADDLE_API {self.outputs['return_type']} {self.api}({params_code}) {{ + return {invoke_code}; +}}""" + def gene_api_code(self): if self.is_base_api: api_code = self.gene_base_api_code() @@ -821,12 +835,8 @@ def adjust_name(matched): invoke_code = re.sub(pattern, adjust_name, self.invoke) params_code = re.sub(pattern, adjust_name, - self.args_str["args_define"]) + self.get_define_args()) else: invoke_code = self.invoke - params_code = self.args_str["args_define"] - return f""" -{self.outputs['return_type']} {self.api}({params_code}) {{ - return {invoke_code}; -}} -""" + params_code = self.get_define_args() + return self.gene_invoke_code(invoke_code, params_code) diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 538958c2361bc..8fd95f9a191c3 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -53,7 +53,7 @@ def gene_return_type_code(self): else: return_out_list = [] for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: + if 
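
gene_return_code changing from an expression ("api_output") to a full statement ("return api_output;") looks cosmetic but is what lets void backward APIs participate: BackwardAPI overrides it to return "" and the shared kernel-code template splices whatever it gets without hard-coding `return`. Distilled:

def gen_kernel_tail(return_code, code_indent="  "):
    # the template now emits the return code verbatim; an empty
    # return_code (void backward api) contributes nothing
    return f"{code_indent}{return_code}" if return_code else ""

assert gen_kernel_tail("return api_output;") == "  return api_output;"
assert gen_kernel_tail("") == ""
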
name.split('@')[0] not in self.intermediate_outs: return_out_list.append(self.outputs['types'][i]) return return_out_list[0] if len( return_out_list) == 1 else "std::tuple<" + ",".join( @@ -61,19 +61,19 @@ def gene_return_type_code(self): def gene_return_code(self): if self.is_dygraph_api or len(self.intermediate_outs) == 0: - return "api_output" + return "return api_output;" else: return_out_list = [] for i, name in enumerate(self.outputs['names']): - if name not in self.intermediate_outs: + if name.split('@')[0] not in self.intermediate_outs: return_out_list.append(i) if len(return_out_list) == 1: - return f"std::get<{return_out_list[0]}>(api_output)" + return f"return std::get<{return_out_list[0]}>(api_output);" else: selected_code = [ f"std::get<{i}>(api_output)" for i in return_out_list ] - return '{' + ", ".join(selected_code) + '}' + return 'return {' + ", ".join(selected_code) + '};' def gene_output(self, output_type_list, diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index ff49fd426146b..7c68829c0959f 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -71,7 +71,7 @@ forward : add_n (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) output : Tensor[](x_grad){x.size()} - invoke : add_n_grad_impl(x, out_grad) + invoke : add_n_grad_impl(x, out_grad, x_grad) no_need_buffer : x - backward_api : add_triple_grad @@ -286,7 +286,7 @@ forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) - invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search) + invoke : conv2d_grad_impl(input, filter, out_grad, strides, paddings, paddding_algorithm, groups, dilations, data_format, use_addto, workspace_size_MB, exhaustive_search, input_grad, filter_grad) backward : conv2d_grad_grad - backward_api : conv2d_grad_grad @@ -766,7 +766,7 @@ forward : imag (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : imag_grad_impl(out_grad) + invoke : imag_grad_impl(out_grad, x_grad) - backward_api : index_sample_grad forward : index_sample (Tensor x, Tensor index) -> Tensor(out) @@ -1395,7 +1395,7 @@ forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - invoke : real_grad_impl(out_grad) + invoke : real_grad_impl(out_grad, x_grad) - backward_api : reciprocal_grad forward : reciprocal (Tensor x) -> Tensor(out) @@ -1796,7 +1796,7 @@ forward : sum_double_grad (Tensor grad_grad_x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(grad_grad_x_grad) - invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all) + invoke : sum_grad(grad_grad_x, grad_grad_out_grad, dims, keep_dim, reduce_all, grad_grad_x_grad) no_need_buffer : x - backward_api : swish_grad diff --git a/python/paddle/utils/code_gen/backward_api_gen.py 
b/python/paddle/utils/code_gen/backward_api_gen.py index a88339c607c55..a155a2c3d6c9f 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -77,6 +77,25 @@ def check_args(self, forward_config): f"{self.api} : Output error: The number of outputs should be less then the number of inputs of forward api. \ Please check the output of {self.api} in yaml." + def get_declare_args(self): + return self.get_define_args() + + def get_define_args(self): + out_type_map = { + 'Tensor': 'Tensor*', + 'std::vector': 'std::vector' + } + intputs_and_attrs = self.args_str['args_define'] + outs = [] + for i, name in enumerate(self.outputs['names']): + outs.append(out_type_map[self.outputs['types'][i]] + ' ' + + name.split('@')[0]) + result = intputs_and_attrs + ', ' + ", ".join(outs) + return result + + def gene_return_code(self): + return "" + def gene_kernel_backend_select(self): all_no_need_buffer = True for in_name in self.inputs['names']: @@ -91,8 +110,7 @@ def gene_kernel_backend_select(self): return super().gene_kernel_backend_select() def get_return_type(self, out_type_list): - return out_type_list[0] if len( - out_type_list) == 1 else "std::vector>" + return 'void' def gene_output(self, output_type_list, @@ -109,23 +127,19 @@ def gene_output(self, inplace_assign = " = " + self.inplace_map[self.outputs['names'][ 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][0] in self.inplace_map else "" - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output{inplace_assign};""" - + output_create = "" if output_type_list[0] == 'std::vector': assert self.outputs['out_size_expr'] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
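
The get_define_args override above appends backward outputs as trailing pointer parameters (the vector entry's template arguments appear stripped in this rendering; presumably it maps std::vector<Tensor> to std::vector<Tensor*>). Distilled, with conv2d-style names for illustration:

out_type_map = {
    'Tensor': 'Tensor*',
    'std::vector<Tensor>': 'std::vector<Tensor*>',
}

def define_args(inputs_and_attrs, out_names, out_types):
    # the '@' suffix (e.g. intermediate markers) is stripped from names
    outs = [out_type_map[t] + ' ' + n.split('@')[0]
            for n, t in zip(out_names, out_types)]
    return inputs_and_attrs + ', ' + ', '.join(outs)

sig = define_args('const Tensor& input, const Tensor& out_grad',
                  ['input_grad', 'filter_grad'], ['Tensor', 'Tensor'])
assert sig.endswith('Tensor* input_grad, Tensor* filter_grad')
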
output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}({self.outputs['out_size_expr']}, kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(&{self.outputs['names'][0]});""" else: output_create = output_create + f""" -{code_indent} auto kernel_out = {set_out_func}(kernel_backend, &api_output);""" +{code_indent} auto kernel_out = {set_out_func}(kernel_backend, {self.outputs['names'][0]});""" elif len(output_type_list) > 1: - output_create = f""" -{code_indent} {self.outputs['return_type']} api_output({len(output_type_list)});""" - + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') @@ -133,26 +147,21 @@ def gene_output(self, if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" -{code_indent} api_output[{i}].emplace_back();""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, &api_output[{i}][0]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(kernel_backend, {self.outputs['names'][i]});""" else: - get_out_code = f'&api_output[{i}]' if inplace_flag and self.inplace_map is not None and self.outputs[ 'names'][i] in self.inplace_map: output_create = output_create + f""" -{code_indent} api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" +{code_indent} *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" assert self.outputs['out_size_expr'][i] is not None, \ f"{api_name}: The out size expr : '{{expr}}' should be set when output has Tensor[]. You can refer 'split' api." 
output_create = output_create + f""" -{code_indent} auto kernel_out_{i} = {set_out_func}({self.outputs['out_size_expr'][i]}, kernel_backend, &api_output[{i}]);""" +{code_indent} auto kernel_out_{i} = {set_out_func}(&{self.outputs['names'][i]});""" kernel_output = kernel_output[:-2] else: @@ -162,6 +171,21 @@ def gene_output(self, return kernel_output, output_names, output_create + def gene_invoke_code(self, invoke_code, params_code): + inveke_func_name = invoke_code.split('(')[0].strip() + if inveke_func_name.endswith('_grad') or inveke_func_name.endswith( + '_grad_impl'): + return f""" +PADDLE_API {self.outputs['return_type']} {self.api}({params_code}) {{ + {invoke_code}; +}}""" + + else: + return f""" +PADDLE_API {self.outputs['return_type']} {self.api}({params_code}) {{ + *{self.outputs['names'][0].split('@')[0]} = {invoke_code}; +}}""" + def header_include(): return """ diff --git a/python/paddle/utils/code_gen/sparse_api_gen.py b/python/paddle/utils/code_gen/sparse_api_gen.py index c0316fc164294..eb9bca2eca7b7 100644 --- a/python/paddle/utils/code_gen/sparse_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_api_gen.py @@ -25,10 +25,9 @@ def __init__(self, api_item_yaml): super(SparseAPI, self).__init__(api_item_yaml) def gene_api_declaration(self): - return f""" -// {", ".join(self.outputs['names'])} -PADDLE_API {self.outputs['return_type']} {self.get_api_func_name()}({self.args_str['args_declare']}); -""" + api_declaration = "// " + ', '.join(self.outputs['names']) + return api_declaration + super(SparseAPI, + self).gene_api_declaration() + '\n' def get_kernel_tensor_out_type(self, output_name): sparse_type = 'TensorType::DENSE_TENSOR' @@ -136,7 +135,8 @@ def gen_sparse_kernel_code(self, inplace_flag=False): kernel_context_code = self.gen_sparse_kernel_context( kernel_output_names) - + return_code = "" if len(self.gene_return_code( + )) == 0 else " " + self.gene_return_code() return f""" auto phi_kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "{self.kernel['func'][0]}", {{kernel_backend, kernel_layout, kernel_data_type}}); @@ -148,13 +148,11 @@ def gen_sparse_kernel_code(self, inplace_flag=False): {output_create} {kernel_context_code} phi_kernel(&kernel_context); - - return api_output;""" +{return_code}""" def gene_base_api_code(self, inplace_flag=False): - api_func_name = self.get_api_func_name() return f""" -PADDLE_API {self.outputs['return_type']} {api_func_name}({self.args_str["args_define"]}) {{ +PADDLE_API {self.gene_return_type_code()} {self.get_api_func_name()}({self.get_define_args()}) {{ {self.gene_kernel_select()} {self.gen_sparse_kernel_code(inplace_flag)} }} diff --git a/python/paddle/utils/code_gen/sparse_bw_api_gen.py b/python/paddle/utils/code_gen/sparse_bw_api_gen.py index 4f209a7592161..6dc4a2668ebb9 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api_gen.py +++ b/python/paddle/utils/code_gen/sparse_bw_api_gen.py @@ -34,9 +34,21 @@ def gene_kernel_backend_select(self): def get_return_type(self, out_type_list): return BackwardAPI.get_return_type(self, out_type_list) + def gene_return_type_code(self): + return self.outputs['return_type'] + + def gene_return_code(self): + return "" + def gene_api_declaration(self): return SparseAPI.gene_api_declaration(self) + def get_declare_args(self): + return BackwardAPI.get_declare_args(self) + + def get_define_args(self): + return BackwardAPI.get_define_args(self) + def gene_output(self, output_type_list, set_out_func, @@ -53,36 +65,21 @@ def gene_output(self, 0]] if inplace_flag and self.inplace_map is 
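
The gene_invoke_code override in this hunk dispatches on the invoked function's name: *_grad / *_grad_impl calls already write through their out-parameters and are emitted as bare statements, while anything else is a value-returning call whose result is stored through the first output pointer. A sketch of that dispatch (the real method reads the return type from self.outputs; 'void' is hard-coded here for brevity):

def gene_invoke_code(api, params_code, invoke_code, first_out_name):
    func = invoke_code.split('(')[0].strip()
    if func.endswith('_grad') or func.endswith('_grad_impl'):
        body = f"  {invoke_code};"
    else:
        body = f"  *{first_out_name.split('@')[0]} = {invoke_code};"
    return f"PADDLE_API void {api}({params_code}) {{\n{body}\n}}"

print(gene_invoke_code("imag_grad",
                       "const Tensor& out_grad, Tensor* x_grad",
                       "imag_grad_impl(out_grad, x_grad)", "x_grad"))
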
not None and self.outputs[ 'names'][0] in self.inplace_map else "" output_create = f""" - {self.outputs['return_type']} api_output{inplace_assign}; - auto kernel_out = {set_out_func}(&api_output, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" + auto kernel_out = {set_out_func}({self.outputs['names'][0].split('@')[0]}, {self.get_kernel_tensor_out_type(self.outputs['names'][0])});""" elif len(output_type_list) > 1: - output_create = f""" - {self.outputs['return_type']} api_output({len(output_type_list)});""" + output_create = "" for i, out_type_item in enumerate(output_type_list): kernel_output = kernel_output + f'kernel_out_{i}, ' output_names.append(f'kernel_out_{i}') - if out_type_item == 'Tensor': - get_out_code = f'&api_output[{i}][0]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}].emplace_back({self.inplace_map[self.outputs['names'][i]]});""" - - else: - output_create = output_create + f""" - api_output[{i}].emplace_back();""" - - else: - get_out_code = f'&api_output[{i}]' - if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][i] in self.inplace_map: - output_create = output_create + f""" - api_output[{i}] = {self.inplace_map[self.outputs['names'][i]]};""" + if inplace_flag and self.inplace_map is not None and self.outputs[ + 'names'][i] in self.inplace_map: + output_create = output_create + f""" + *{self.outputs['names'][i]} = {self.inplace_map[self.outputs['names'][i]]};""" output_create = output_create + f""" - auto kernel_out_{i} = {set_out_func}({get_out_code}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" + auto kernel_out_{i} = {set_out_func}({self.outputs['names'][i].split('@')[0]}, {self.get_kernel_tensor_out_type(self.outputs['names'][i])});""" kernel_output = kernel_output[:-2] else: diff --git a/python/paddle/utils/code_gen/strings_api_gen.py b/python/paddle/utils/code_gen/strings_api_gen.py index 061ea6c3ceef9..815b9176cd22c 100644 --- a/python/paddle/utils/code_gen/strings_api_gen.py +++ b/python/paddle/utils/code_gen/strings_api_gen.py @@ -194,7 +194,7 @@ def gen_string_tensor_kernel_code(self, inplace_flag=False, code_indent=""): {code_indent} auto* kernel_fn = kernel.GetVariadicKernelFn(); {code_indent} (*kernel_fn)({kernel_args}, {outputs_args}); -{code_indent} return {self.gene_return_code()};""" +{code_indent} {self.gene_return_code()}""" def gene_kernel_select(self) -> str: api = self.api From bf44034c6ba53e858b31faa14e70d787795caf80 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 12 May 2022 10:15:22 +0800 Subject: [PATCH 17/49] [Yaml]Tile and expand double grad (#42680) * add tile double_grad yaml and test case * add expand double yaml and test case * add clip dobule grad yaml and test case * add concat dobule grad yaml and test case --- .../fluid/tests/unittests/test_nn_grad.py | 23 ++++++++++ python/paddle/utils/code_gen/backward.yaml | 43 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_nn_grad.py b/python/paddle/fluid/tests/unittests/test_nn_grad.py index d89465c5aecab..3a100cd321e03 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_nn_grad.py @@ -157,6 +157,9 @@ def test_grad(self): class TestTileDoubleGradCheck(unittest.TestCase): + def tile_wrapper(self, x): + return paddle.tile(x[0], [4, 9]) + @prog_scope() def func(self, place): x_shape = [3, 12] @@ -171,6 
+174,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.tile_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -181,6 +186,9 @@ def test_grad(self): class TestExpandV2DoubleGradCheck(unittest.TestCase): + def expand_wrapper(self, x): + return paddle.expand(x[0], [4, 12]) + @prog_scope() def func(self, place): x_shape = [1, 12] @@ -195,6 +203,8 @@ def func(self, place): gradient_checker.double_grad_check( [x], out, x_init=x_arr, place=place, eps=eps) + gradient_checker.double_grad_check_for_dygraph( + self.expand_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -253,6 +263,9 @@ def test_grad(self): class TestClipDoubleGradCheck(unittest.TestCase): + def clip_wrapper(self, x): + return paddle.clip(x[0], min=-1., max=1.) + @prog_scope() def func(self, place): x_shape = [2, 4, 10] @@ -264,6 +277,8 @@ def func(self, place): x_arr = np.random.uniform(-5., 5., x_shape).astype(dtype) gradient_checker.double_grad_check([x], out, x_init=x_arr, place=place) + gradient_checker.double_grad_check_for_dygraph( + self.clip_wrapper, [x], out, x_init=x_arr, place=place) def test_grad(self): places = [fluid.CPUPlace()] @@ -357,6 +372,9 @@ def func(self, place): class TestConcatDoubleGradCheck(unittest.TestCase): + def concat_wrapper(self, x): + return paddle.concat(x, axis=0) + @prog_scope() def func(self, place): x_shape = [2, 3, 4, 5] @@ -373,6 +391,11 @@ def func(self, place): gradient_checker.double_grad_check( [x1, x2], out, x_init=[x1_arr, x2_arr], place=place) + gradient_checker.double_grad_check_for_dygraph( + self.concat_wrapper, [x1, x2], + out, + x_init=[x1_arr, x2_arr], + place=place) def test_grad(self): places = [fluid.CPUPlace()] diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 7c68829c0959f..1d27473d5c25c 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -251,6 +251,16 @@ kernel : func : cholesky_solve_grad +- backward_api : clip_double_grad + forward : clip_grad (Tensor x, Tensor grad_out, Scalar min = 0., Scalar max = 0.) -> Tensor(grad_x) + args : (Tensor x, Tensor grad_x_grad, Scalar min = 0., Scalar max = 0.) + output : Tensor(grad_out_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : clip_grad + - backward_api : clip_grad forward : clip (Tensor x, Scalar min, Scalar max) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar min = 0., Scalar max = 0.) 
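
The clip_double_grad entry added above reuses func : clip_grad because clip is piecewise linear: its first-order grad masks the incoming gradient by (roughly) min < x < max, and differentiating that map again applies the same mask to grad_x_grad. In numpy terms, with the min=-1., max=1. used by clip_wrapper:

import numpy as np

x = np.array([-2.0, -0.5, 0.3, 1.7])
grad_x_grad = np.ones_like(x)
mask = (x > -1.0) & (x < 1.0)        # boundary handling may differ
grad_out_grad = grad_x_grad * mask
assert grad_out_grad.tolist() == [0.0, 1.0, 1.0, 0.0]
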
@@ -260,6 +270,18 @@ param : [x] kernel : func : clip_grad + backward : clip_double_grad + +- backward_api : concat_double_grad + forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) + args : (Tensor[] grad_x_grad, Scalar axis = 0) + output : Tensor(grad_out_grad) + infer_meta : + func : ConcatInferMeta + param : [grad_x_grad, axis] + kernel : + func : concat + no_need_buffer : x - backward_api : concat_grad forward : concat (Tensor[] x, Scalar axis) -> Tensor(out) @@ -271,6 +293,7 @@ kernel : func : concat_grad no_need_buffer : x + backward : concat_double_grad - backward_api : conj_grad forward : conj (Tensor x) -> Tensor(out) @@ -582,6 +605,15 @@ func : expand_as_grad no_need_buffer : x +- backward_api : expand_double_grad + forward : expand_grad (Tensor x, Tensor grad_out, IntArray shape) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray shape) + output : Tensor(grad_out_grad) + infer_meta : + func : ExpandInferMeta + kernel : + func : expand + - backward_api : expand_grad forward : expand (Tensor x, IntArray shape) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shape) @@ -592,6 +624,7 @@ kernel : func : expand_grad no_need_buffer : x + backward : expand_double_grad - backward_api : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) @@ -1881,6 +1914,15 @@ kernel : func : thresholded_relu_grad +- backward_api : tile_double_grad + forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) + args : (Tensor grad_x_grad, IntArray repeat_times) + output : Tensor(grad_out_grad) + infer_meta : + func : TileInferMeta + kernel : + func : tile + - backward_api : tile_grad forward : tile (Tensor x, IntArray repeat_times) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray repeat_times) @@ -1891,6 +1933,7 @@ kernel : func : tile_grad no_need_buffer : x + backward : tile_double_grad - backward_api : top_k_grad forward : top_k (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -> Tensor(out), Tensor(indices) From ddb3868ec18b234a96c2c83d6f3d552700f99d36 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Thu, 12 May 2022 10:15:55 +0800 Subject: [PATCH 18/49] [MLU] add slice kernel (#42245) --- paddle/fluid/operators/mlu/mlu_baseop.cc | 5 +- paddle/fluid/operators/slice_op_mlu.cc | 196 ++++++ .../tests/unittests/mlu/test_slice_op_mlu.py | 631 ++++++++++++++++++ 3 files changed, 830 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/operators/slice_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 8c907ab0e8dec..6b801924446ca 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -688,8 +688,9 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const cnnlTensorDescriptor_t diff_y_desc, void* back_out) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits( - handle, mode, x_desc, input, label_desc, label, y_desc, output, + const cnnlComputationPreference_t prefer = CNNL_COMPUTATION_HIGH_PRECISION; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSparseSoftmaxCrossEntropyWithLogits_v2( + handle, prefer, mode, x_desc, input, label_desc, label, y_desc, output, diff_y_desc, back_out)); } diff --git a/paddle/fluid/operators/slice_op_mlu.cc b/paddle/fluid/operators/slice_op_mlu.cc new file mode 100644 index 
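
tile_double_grad and expand_double_grad above dispatch straight to the forward kernels (func : tile, func : expand): the first-order grads sum-reduce grad_out over the repeats or broadcast dims, and the adjoint of that reduction is tiling/broadcasting again, so grad_out_grad = tile(grad_x_grad, repeat_times). Sketched in numpy with the shapes from the tests:

import numpy as np

grad_x_grad = np.random.rand(3, 12)            # x_shape in the test
grad_out_grad = np.tile(grad_x_grad, (4, 9))   # just the forward kernel
assert grad_out_grad.shape == (12, 108)
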
0000000000000..43322e4b2e75b --- /dev/null +++ b/paddle/fluid/operators/slice_op_mlu.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/slice_op.h" + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/phi/kernels/funcs/slice_utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SliceMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* out = ctx.Output("Out"); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto infer_flags = ctx.Attr>("infer_flags"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + PADDLE_ENFORCE_EQ( + starts.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of starts must be equal to the size of axes.")); + PADDLE_ENFORCE_EQ( + ends.size(), axes.size(), + platform::errors::InvalidArgument( + "The size of ends must be equal to the size of axes.")); + + const auto& in_dims = input->dims(); + auto slice_dims = out->dims(); + bool reset_slice_dims = false; + if (ctx.HasInput("StartsTensor") || ctx.HasInput("EndsTensor") || + starts_tensor_list.size() > 0 || ends_tensor_list.size() > 0) { + // Infer output dims + for (size_t i = 0; i < axes.size(); ++i) { + // when start == -1 && end == start+1 + if (starts[i] == -1 && ends[i] == 0 && infer_flags[i] == -1) { + auto ret = + std::find(decrease_axis.begin(), decrease_axis.end(), axes[i]); + if (ret != decrease_axis.end()) { + ends[i] = in_dims[axes[i]]; + } + } + } + + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + reset_slice_dims = true; + auto out_dims = phi::funcs::GetDecreasedDims(slice_dims, decrease_axis); + + out->Resize(out_dims); + } + if (slice_dims.size() != in_dims.size() && !reset_slice_dims) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector 
tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + out->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc input_desc(*input); + MLUCnnlTensorDesc out_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnl::StridedSlice(ctx, starts.data(), ends.data(), strides.data(), + input_desc.get(), GetBasePtr(input), out_desc.get(), + GetBasePtr(out)); + } +}; + +template +class SliceGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes = ctx.Attr>("axes"); + auto starts = ctx.Attr>("starts"); + auto ends = ctx.Attr>("ends"); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } + + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); + } + + const auto& in_dims = input->dims(); + auto slice_dims = dout->dims(); + if (slice_dims.size() != in_dims.size()) { + phi::funcs::CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends); + slice_dims = phi::funcs::GetSliceDims(in_dims, axes, starts, ends, + nullptr, nullptr); + } + + int in_dim_size = input->dims().size(); + if (static_cast(axes.size()) != in_dim_size) { + std::vector tmp_starts(in_dim_size, 0); + const auto& in_dims_vec = phi::vectorize(input->dims()); + std::vector tmp_ends(in_dims_vec.begin(), in_dims_vec.end()); + for (size_t i = 0; i < axes.size(); ++i) { + tmp_starts[axes[i]] = starts[i]; + tmp_ends[axes[i]] = ends[i]; + } + starts.swap(tmp_starts); + ends.swap(tmp_ends); + } + std::vector strides(in_dim_size, 1); + + dinput->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc dout_desc(slice_dims.size(), + phi::vectorize(slice_dims).data(), + ToCnnlDataType()); + MLUCnnlTensorDesc dinput_desc(*dinput); + MLUCnnl::StridedSliceGrad(ctx, starts.data(), ends.data(), strides.data(), + dout_desc.get(), GetBasePtr(dout), + dinput_desc.get(), GetBasePtr(dinput)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(slice, ops::SliceMLUKernel, + ops::SliceMLUKernel, ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel, + ops::SliceMLUKernel); + +REGISTER_OP_MLU_KERNEL(slice_grad, ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel, + ops::SliceGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py new file mode 100644 index 0000000000000..44532ddceb765 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py @@ -0,0 +1,631 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
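
A numpy model of the axis-padding step in the kernel above: when the op specifies only a subset of axes, starts/ends are expanded to the tensor's full rank with stride 1 everywhere before the MLU StridedSlice primitive is called (this assumes starts/ends were already normalized by CheckAndUpdateSliceAttrs). Values below mirror the first test case in the new test file:

import numpy as np

def full_rank_slice(x, axes, starts, ends):
    tmp_starts = [0] * x.ndim
    tmp_ends = list(x.shape)
    for axis, s, e in zip(axes, starts, ends):
        tmp_starts[axis], tmp_ends[axis] = s, e
    idx = tuple(slice(s, e, 1) for s, e in zip(tmp_starts, tmp_ends))
    return x[idx]

x = np.random.rand(3, 4, 5, 6)
ref = x[1:3, 0:3, 2:4, :]
assert np.array_equal(full_rank_slice(x, [0, 1, 2], [1, 0, 2], [3, 3, 4]), ref)
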
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +import sys +sys.path.append('..') +from op_test import OpTest +import paddle.fluid as fluid +import paddle.fluid.layers as layers +import paddle + +paddle.enable_static() + + +# Situation 1: starts(list, no tensor), ends(list, no tensor) +# 1.1 without attr(decrease) +class TestSliceOp(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestCase1(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, 2:-1, :] + + +class TestCase2(TestSliceOp): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[-3:3, 0:100, :, 2:-1] + + +# 1.2 with attr(decrease) +class TestSliceOp_decs_dim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + 
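
The decrease_axis tests here encode the difference between slicing and integer indexing: a slice keeps a length-1 axis, integer indexing drops it, and decrease_axis lists the axes dropped at the Python level. In numpy terms:

import numpy as np

x = np.random.rand(3, 4, 5, 6)
kept = x[1:2, 0:3, 2:4, :]     # no decrease: shape (1, 3, 2, 6)
dropped = x[1, 0:3, 2:4, :]    # decrease_axis=[0]: shape (3, 2, 6)
assert kept.shape == (1, 3, 2, 6) and dropped.shape == (3, 2, 6)
assert np.array_equal(kept[0], dropped)
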
+ +class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# without attr(decrease) +class TestSliceOp_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int64') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, 1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.starts_infer = [-1, 0, -1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +# Situation 2: starts(list, have tensor), ends(list, no tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_mlu() + self.config() + + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, 
['Input'], 'Out', max_relative_error=0.006) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + +class TestSliceOp_decs_dim_5_starts_ListTensor( + TestSliceOp_decs_dim_starts_ListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +# Situation 3: starts(tensor), ends(list, no tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 4: starts(tensor), ends(tensor) +# without attr(decrease) +class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int64"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 5: starts(tensor), ends(tensor) +# with attr(decrease) +class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32"), + "EndsTensor": np.array( + self.ends, dtype="int32") + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + 
self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Situation 6: starts(tensor), ends(list, have tensor) +# without attr(decrease) +class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + + ends_tensor = [] + for index, ele in enumerate(self.ends): + ends_tensor.append(("y" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype="int32"), + 'EndsTensorList': ends_tensor + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends_infer, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype("float32") + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + self.ends_infer = [-1, 3, 4] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +# Test float16 +class TestFP16(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-3, 0, 2] + self.ends = [3, 100, -1] + self.axes = [0, 1, 3] + self.out = self.input[-3:3, 0:100, :, 2:-1] + self.infer_flags = [1, 1, 1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006) + + +class TestFP16_2(OpTest): + def setUp(self): + self.op_type = "slice" + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 10]).astype(self.dtype) + self.starts = [0] + self.ends = [1] + self.axes = [1] + self.out = self.input[:, 0:1, :] + self.infer_flags = [1] + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-5) + + def test_check_grad_normal(self): + self.check_grad_with_place( + self.place, ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5) + + +class TestSliceApiWithTensor(unittest.TestCase): + def test_starts_ends_is_tensor(self): + with paddle.fluid.dygraph.guard(): + a = paddle.rand(shape=[4, 5, 6], dtype='float32') + axes = [0, 1, 2] + starts = [-3, 0, 2] + ends = [3, 2, 4] + a_1 = paddle.slice( + a, + axes=axes, + starts=paddle.to_tensor( + starts, dtype='int32'), + ends=paddle.to_tensor( + ends, dtype='int32')) + a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) + + self.assertTrue(np.array_equal(a_1.numpy(), a_2.numpy())) + + def test_bool_tensor(self): + with paddle.fluid.dygraph.guard(): + array = 
(np.arange(60).reshape([3, 4, 5]) % 3).astype('bool') + tt = paddle.to_tensor(array) + tt.stop_gradient = False + + starts = [0, 1, 2] + ends = [3, 5, 4] + axes = [0, 1, 2] + + y_paddle = paddle.slice(tt, axes, starts, ends) + y_np = tt[0:3, 1:5, 2:4] + + self.assertTrue(paddle.bool == y_paddle.dtype) + self.assertTrue(np.array_equal(y_paddle.numpy(), y_np)) + + +class TestImperativeVarBaseGetItem(unittest.TestCase): + def test_getitem_with_long(self): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here + self.assertEqual(sliced.shape, [2, 70, 80]) + + sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]] + self.assertEqual(sliced.shape, [2, 78, 78]) + + def test_getitem_with_float(self): + def test_float_in_slice_item(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[:, 1.1:, :var.shape[1]] + + self.assertRaises(Exception, test_float_in_slice_item) + + def test_float_in_index(): + with fluid.dygraph.guard(): + data = np.random.random((2, 80, 16128)).astype('float32') + var = fluid.dygraph.to_variable(data) + sliced = var[1.1] + + self.assertRaises(Exception, test_float_in_index) + + +class TestInferShape(unittest.TestCase): + def test(self): + x = paddle.ones(shape=[3, 4, 5]) + x.desc.set_shape([3, -1, 5]) + self.assertEqual(x.shape, (3, -1, 5)) + + out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) + self.assertEqual(out0.shape, (3, 3, 5)) + + def test_axis_less_than_zero(self): + + # Using paddle.disable_static will make other unittests fail. + with fluid.dygraph.guard(): + x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4]) + x = paddle.to_tensor(x_arr) + + pp_slice = paddle.slice(x, [100, ], [0], [1]) + np_slice = x_arr[:, :, 0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + pp_slice = paddle.slice(x, (-100, ), [0], [1]) + np_slice = x_arr[0:1] + self.assertTrue(np.array_equal(pp_slice, np_slice)) + + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) + + starts = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + ends = paddle.to_tensor( + np.reshape( + np.array( + [], dtype=np.int32), (0, ))) + + with self.assertRaises(ValueError): + paddle.slice(x, [-1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [1000000], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, [], starts, ends) + + with self.assertRaises(ValueError): + paddle.slice(x, 0, starts, ends) + + +if __name__ == '__main__': + unittest.main() From f1eda7d01e9d51bd151794c11e79abe85c3b500b Mon Sep 17 00:00:00 2001 From: tiancaishaonvjituizi <452565578@qq.com> Date: Thu, 12 May 2022 10:22:38 +0800 Subject: [PATCH 19/49] =?UTF-8?q?=E3=80=90Hackathon=20No.60=E3=80=91refact?= =?UTF-8?q?or=20unary=20sparse=20ops=20and=20add=20sparse=20sqrt,=20tanh,?= =?UTF-8?q?=20sin=20(#41356)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- paddle/phi/kernels/activation_grad_kernel.h | 1 + .../kernels/sparse/activation_grad_kernel.cc | 70 ------- .../kernels/sparse/activation_grad_kernel.h | 29 --- .../phi/kernels/sparse/activation_kernel.cc | 66 ------- paddle/phi/kernels/sparse/activation_kernel.h | 39 ---- .../phi/kernels/sparse/unary_grad_kernel.cc | 183 ++++++++++++++++++ 
paddle/phi/kernels/sparse/unary_grad_kernel.h | 41 ++++ paddle/phi/kernels/sparse/unary_kernel.cc | 177 +++++++++++++++++ paddle/phi/kernels/sparse/unary_kernel.h | 48 +++++ .../kernels/test_sparse_activation_dev_api.cc | 6 +- .../unittests/test_sparse_activation_op.py | 50 ----- .../tests/unittests/test_sparse_unary_op.py | 133 +++++++++++++ python/paddle/sparse/__init__.py | 16 +- python/paddle/sparse/functional/__init__.py | 7 +- python/paddle/sparse/functional/activation.py | 53 ----- python/paddle/sparse/functional/unary.py | 177 +++++++++++++++++ python/paddle/sparse/layer/__init__.py | 2 +- .../sparse/layer/{activation.py => unary.py} | 0 python/paddle/utils/code_gen/sparse_api.yaml | 68 ++++++- .../paddle/utils/code_gen/sparse_bw_api.yaml | 35 +++- 20 files changed, 867 insertions(+), 334 deletions(-) delete mode 100644 paddle/phi/kernels/sparse/activation_grad_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/activation_grad_kernel.h delete mode 100644 paddle/phi/kernels/sparse/activation_kernel.cc delete mode 100644 paddle/phi/kernels/sparse/activation_kernel.h create mode 100644 paddle/phi/kernels/sparse/unary_grad_kernel.cc create mode 100644 paddle/phi/kernels/sparse/unary_grad_kernel.h create mode 100644 paddle/phi/kernels/sparse/unary_kernel.cc create mode 100644 paddle/phi/kernels/sparse/unary_kernel.h delete mode 100644 python/paddle/fluid/tests/unittests/test_sparse_activation_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_unary_op.py delete mode 100644 python/paddle/sparse/functional/activation.py create mode 100644 python/paddle/sparse/functional/unary.py rename python/paddle/sparse/layer/{activation.py => unary.py} (100%) diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h index fd42756ba3867..084843c31cf52 100644 --- a/paddle/phi/kernels/activation_grad_kernel.h +++ b/paddle/phi/kernels/activation_grad_kernel.h @@ -187,6 +187,7 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid); +DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Round); DECLARE_ACTIVATION_GRAD_KERNEL_NODEP(Floor); diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.cc b/paddle/phi/kernels/sparse/activation_grad_kernel.cc deleted file mode 100644 index 9eca14e660939..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.cc +++ /dev/null @@ -1,70 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/activation_grad_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluGradKernel(dev_ctx, - x.non_zero_elements(), - out_grad.non_zero_elements(), - &non_zero_elements); - x_grad->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu_grad, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu_grad, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluGradKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_grad_kernel.h b/paddle/phi/kernels/sparse/activation_grad_kernel.h deleted file mode 100644 index aab4a3e5a590b..0000000000000 --- a/paddle/phi/kernels/sparse/activation_grad_kernel.h +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/sparse_coo_tensor.h" - -namespace phi { -namespace sparse { - -template -void SparseReluGradKernel(const Context& dev_ctx, - const SparseCooTensor& x, - const SparseCooTensor& out_grad, - SparseCooTensor* x_grad); - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/activation_kernel.cc b/paddle/phi/kernels/sparse/activation_kernel.cc deleted file mode 100644 index a1a00897d33cf..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.cc +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/phi/kernels/sparse/activation_kernel.h" -#include "paddle/phi/kernels/copy_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out) { - DenseTensor non_zero_indices = - phi::EmptyLike(dev_ctx, x.non_zero_indices()); - DenseTensor non_zero_elements = - phi::EmptyLike(dev_ctx, x.non_zero_elements()); - phi::Copy(dev_ctx, - x.non_zero_indices(), - dev_ctx.GetPlace(), - false, - &non_zero_indices); - phi::ReluKernel( - dev_ctx, x.non_zero_elements(), &non_zero_elements); - out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); -} - -} // namespace sparse -} // namespace phi - -PD_REGISTER_KERNEL(sparse_relu, - CPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL(sparse_relu, - GPU, - ALL_LAYOUT, - phi::sparse::SparseReluKernel, - float, - double, - phi::dtype::float16) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); -} -#endif diff --git a/paddle/phi/kernels/sparse/activation_kernel.h b/paddle/phi/kernels/sparse/activation_kernel.h deleted file mode 100644 index 568c0aa8b2ecb..0000000000000 --- a/paddle/phi/kernels/sparse/activation_kernel.h +++ /dev/null @@ -1,39 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/sparse_coo_tensor.h" -#include "paddle/phi/kernels/activation_kernel.h" -#include "paddle/phi/kernels/empty_kernel.h" - -namespace phi { -namespace sparse { - -template -void SparseReluKernel(const Context& dev_ctx, - const SparseCooTensor& x, - SparseCooTensor* out); - -template -SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { - DenseTensor indices, values; - SparseCooTensor coo(indices, values, x.dims()); - SparseReluKernel(dev_ctx, x, &coo); - return coo; -} - -} // namespace sparse -} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.cc b/paddle/phi/kernels/sparse/unary_grad_kernel.cc new file mode 100644 index 0000000000000..1fd3ef2711299 --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.cc @@ -0,0 +1,183 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_grad_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ + namespace phi { \ + namespace sparse { \ + \ + template \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x_or_out, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + x_grad->SetMember( \ + non_zero_indices, non_zero_elements, x_or_out.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x_or_out, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x_or_out.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x_or_out.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc(dev_ctx, \ + x_or_out.non_zero_elements(), \ + out_grad.non_zero_elements(), \ + &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x_or_out.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ 
+ phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(kernel_name, \ + DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_GRAD_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin_grad`, +// `tanh_grad` etc, do NOT follow the following `relu_grad`. +DEFINE_SPARSE_UNARY_GRAD_KERNEL(ReluGradKernel) + +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_relu_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrReluGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sin_grad, SinGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_GRAD_KERNEL(tanh_grad, TanhGradKernel) diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h new file mode 100644 index 0000000000000..24ea4fee1a4fd --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
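+//
+// Reader's note (a summary, not normative): each SparseCoo*GradKernel
+// declared below copies the input's indices unchanged and applies the
+// matching dense activation gradient kernel to the stored non-zero values
+// only; the SparseCsr* variants do the same for crows/cols. The definitions
+// are macro-generated in unary_grad_kernel.cc.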
+ +#pragma once + +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" + +#define DECLARE_SPARSE_UNARY_GRAD_KERNEL(name) \ + template \ + void SparseCoo##name##GradKernel(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + const SparseCooTensor& out_grad, \ + SparseCooTensor* x_grad); \ + \ + template \ + void SparseCsr##name##GradKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + const SparseCsrTensor& out_grad, \ + SparseCsrTensor* x_grad); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Relu) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_GRAD_KERNEL(Sin) + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.cc b/paddle/phi/kernels/sparse/unary_kernel.cc new file mode 100644 index 0000000000000..e02d7757664fa --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.cc @@ -0,0 +1,177 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/copy_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + namespace phi { \ + namespace sparse { \ + \ + template \ + void SparseCoo##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCooTensor& x, \ + SparseCooTensor* out) { \ + DenseTensor non_zero_indices = \ + phi::EmptyLike(dev_ctx, x.non_zero_indices()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_indices(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_indices); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember(non_zero_indices, non_zero_elements, x.dims(), true); \ + } \ + \ + template \ + void SparseCsr##DenseKernelFunc(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + SparseCsrTensor* out) { \ + DenseTensor non_zero_crows = \ + phi::EmptyLike(dev_ctx, x.non_zero_crows()); \ + DenseTensor non_zero_cols = \ + phi::EmptyLike(dev_ctx, x.non_zero_cols()); \ + DenseTensor non_zero_elements = \ + phi::EmptyLike(dev_ctx, x.non_zero_elements()); \ + phi::Copy(dev_ctx, \ + x.non_zero_crows(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_crows); \ + phi::Copy(dev_ctx, \ + x.non_zero_cols(), \ + dev_ctx.GetPlace(), \ + false, \ + &non_zero_cols); \ + phi::DenseKernelFunc( \ + dev_ctx, x.non_zero_elements(), &non_zero_elements); \ + out->SetMember( \ + non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); \ + } \ + } \ + } + +#define REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + 
PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + CPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + PD_REGISTER_KERNEL(sparse_coo_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCoo##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); \ + } \ + \ + PD_REGISTER_KERNEL(sparse_csr_##kernel_name, \ + GPU, \ + ALL_LAYOUT, \ + phi::sparse::SparseCsr##DenseKernelFunc, \ + float, \ + double, \ + phi::dtype::float16) { \ + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); \ + } +#else +// This macro definition is empty when GPU is disabled +#define REGISTER_GPU_SPARSE_UNARY_KERNEL(sparse_kernel_name, DenseKernelFunc) +#endif + +#define REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_CPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + REGISTER_GPU_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +#define DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) \ + DEFINE_SPARSE_UNARY_KERNEL(DenseKernelFunc) \ + REGISTER_SPARSE_UNARY_KERNEL(kernel_name, DenseKernelFunc) + +// NOTE: the following code is to bypass the restriction of Paddle +// kernel registration mechanism. Do NOT refactor them unless you +// know what you are doing. +// If you want to implement any new kernel, please follow `sin`, +// `tanh` etc, do NOT follow `sqrt`. +DEFINE_SPARSE_UNARY_KERNEL(SqrtKernel) + +PD_REGISTER_KERNEL(sparse_coo_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +PD_REGISTER_KERNEL(sparse_csr_sqrt, + CPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(sparse_coo_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCooSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(sparse_csr_sqrt, + GPU, + ALL_LAYOUT, + phi::sparse::SparseCsrSqrtKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +#endif + +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(sin, SinKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(tanh, TanhKernel) +DEFINE_AND_REGISTER_SPARSE_UNARY_KERNEL(relu, ReluKernel) diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h new file mode 100644 index 0000000000000..4470173c143db --- /dev/null +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" + +#define DECLARE_SPARSE_UNARY_KERNEL(name) \ + template \ + void SparseCoo##name##Kernel( \ + const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out); \ + \ + template \ + void SparseCsr##name##Kernel( \ + const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out); + +namespace phi { +namespace sparse { + +DECLARE_SPARSE_UNARY_KERNEL(Relu) +DECLARE_SPARSE_UNARY_KERNEL(Sqrt) +DECLARE_SPARSE_UNARY_KERNEL(Sin) + +template +SparseCooTensor SparseRelu(const Context& dev_ctx, const SparseCooTensor& x) { + DenseTensor indices, values; + SparseCooTensor coo(indices, values, x.dims()); + SparseCooReluKernel(dev_ctx, x, &coo); + return coo; +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index 43640da270aad..05781156cd1d6 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -24,9 +24,9 @@ limitations under the License. */ #include "paddle/phi/kernels/activation_grad_kernel.h" #include "paddle/phi/kernels/activation_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" -#include "paddle/phi/kernels/sparse/activation_grad_kernel.h" -#include "paddle/phi/kernels/sparse/activation_kernel.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" namespace phi { namespace tests { @@ -70,7 +70,7 @@ TEST(DEV_API, sparse_relu) { SparseCooTensor sparse_out_grad( sparse_coo.non_zero_indices(), dense_out, {3, 4}); - sparse::SparseReluGradKernel( + sparse::SparseCooReluGradKernel( dev_ctx_cpu, sparse_coo, sparse_out_grad, &sparse_grad_x); cmp = memcmp(dense_grad_x.data(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py b/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py deleted file mode 100644 index b4abbd56303ff..0000000000000 --- a/python/paddle/fluid/tests/unittests/test_sparse_activation_op.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function -import unittest -import numpy as np -import paddle -from paddle.fluid.framework import _test_eager_guard - - -class TestSparseActivation(unittest.TestCase): - def test_sparse_relu(self): - with _test_eager_guard(): - x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] - - def dense_relu(x): - dense_x = paddle.to_tensor( - x, dtype='float32', stop_gradient=False) - dense_relu = paddle.nn.ReLU() - dense_out = dense_relu(dense_x) - dense_out.backward(dense_out) - return dense_out, dense_x.grad - - dense_x = paddle.to_tensor(x, dtype='float32', stop_gradient=False) - sparse_dim = 2 - sparse_x = dense_x.to_sparse_coo(sparse_dim) - sparse_relu = paddle.sparse.ReLU() - sparse_out = sparse_relu(sparse_x) - sparse_out.backward(sparse_out) - - dense_out, dense_x_grad = dense_relu(x) - assert np.array_equal(dense_out.numpy(), - sparse_out.to_dense().numpy()) - assert np.array_equal(dense_x_grad.numpy(), - sparse_x.grad.to_dense().numpy()) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py new file mode 100644 index 0000000000000..573cc5ba8cf5d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -0,0 +1,133 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
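+
+# Methodology note (a summary of the tests below, not normative): each case
+# builds a dense reference and a sparse tensor from the same data, applies
+# the dense and sparse variants of one unary op, and compares forward values
+# and, where gradients are tested, backward values. Entries where the dense
+# reference evaluates to NaN are masked out first, since e.g. the dense
+# gradient of sqrt at an implicit zero is 0 * inf = NaN, while a sparse
+# kernel never touches unstored elements.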
+ +from __future__ import print_function +import unittest +from typing import Union, Callable +import numpy as np +import paddle +from paddle.fluid.framework import _test_eager_guard +from paddle import _C_ops + + +class TestSparseUnary(unittest.TestCase): + def assert_raises_on_dense_tensor(self, sparse_func): + with _test_eager_guard(): + dense_x = paddle.ones((2, 3)) + with self.assertRaises(ValueError): + sparse_func(dense_x) + + def compare_with_dense( + self, + x, + to_sparse: Callable[[paddle.Tensor], paddle.Tensor], + dense_func: Callable[[paddle.Tensor], paddle.Tensor], + sparse_func: Callable[[paddle.Tensor], paddle.Tensor], + test_gradient: bool, ): + def tensor_allclose(dense_tensor: paddle.Tensor, + sparse_tensor: paddle.Tensor): + dense_numpy = dense_tensor.numpy() + mask = ~np.isnan(dense_numpy) + return np.allclose(dense_numpy[mask], + sparse_tensor.to_dense().numpy()[mask]) + + with _test_eager_guard(): + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + + sparse_x = to_sparse(dense_x) + sparse_out = sparse_func(sparse_x) + + dense_x = paddle.to_tensor( + x, dtype="float32", stop_gradient=not test_gradient) + dense_out = dense_func(dense_x) + + assert tensor_allclose(dense_out, sparse_out) + + if test_gradient: + dense_out.backward(dense_out) + sparse_out.backward(sparse_out) + assert tensor_allclose(dense_x.grad, sparse_x.grad) + + def test_sparse_relu(self): + x = [[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.nn.ReLU(), + paddle.sparse.ReLU(), + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.ReLU()) + + def test_sparse_sqrt(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sqrt, + paddle.sparse.sqrt, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sqrt, + paddle.sparse.sqrt, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sqrt) + + def test_sparse_sin(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, 4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.sin, + paddle.sparse.sin, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.sin, + paddle.sparse.sin, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.sin) + + def test_sparse_tanh(self): + x = [[0, 16, 0, 0], [0, 0, 0, 0], [0, -4, 2, 0]] + sparse_dim = 2 + self.compare_with_dense( + x, + lambda x: x.to_sparse_coo(sparse_dim), + paddle.tanh, + paddle.sparse.tanh, + True, ) + self.compare_with_dense( + x, + lambda x: x.to_sparse_csr(), + paddle.tanh, + paddle.sparse.tanh, + False, ) + self.assert_raises_on_dense_tensor(paddle.sparse.tanh) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/sparse/__init__.py b/python/paddle/sparse/__init__.py index 93653e09c9019..26a2f0cfadbe7 100644 --- a/python/paddle/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -14,15 +14,19 @@ from .creation import sparse_coo_tensor from .creation import sparse_csr_tensor -from .layer.activation import ReLU -from .layer.norm import BatchNorm +from .layer import ReLU +from .layer import BatchNorm -from .layer.conv import Conv3D -from .layer.conv import SubmConv3D +from .layer import Conv3D +from .layer import 
SubmConv3D -from .layer.pooling import MaxPool3D +from .layer import MaxPool3D + +from .functional import sqrt +from .functional import sin +from .functional import tanh __all__ = [ 'sparse_coo_tensor', 'sparse_csr_tensor', 'ReLU', 'Conv3D', 'SubmConv3D', - 'BatchNorm', 'MaxPool3D' + 'BatchNorm', 'MaxPool3D', 'sqrt', 'sin', 'tanh' ] diff --git a/python/paddle/sparse/functional/__init__.py b/python/paddle/sparse/functional/__init__.py index f1ca4cc6fcc48..cfefa3ff4ff76 100644 --- a/python/paddle/sparse/functional/__init__.py +++ b/python/paddle/sparse/functional/__init__.py @@ -12,9 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .activation import relu # noqa: F401 +from .unary import relu # noqa: F401 +from .unary import tanh # noqa: F401 +from .unary import sqrt # noqa: F401 +from .unary import sin # noqa: F401 from .conv import conv3d # noqa: F401 from .conv import subm_conv3d # noqa: F401 from .pooling import max_pool3d # noqa: F401 -__all__ = ['relu', 'conv3d', 'subm_conv3d', 'max_pool3d'] +__all__ = ['relu', 'tanh', 'conv3d', 'subm_conv3d', 'max_pool3d', 'sqrt', 'sin'] diff --git a/python/paddle/sparse/functional/activation.py b/python/paddle/sparse/functional/activation.py deleted file mode 100644 index c0109bc4e2429..0000000000000 --- a/python/paddle/sparse/functional/activation.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -__all__ = [] - -from paddle import _C_ops, in_dynamic_mode - - -def relu(x, name=None): - """ - sparse relu activation. - - .. math:: - - out = max(x, 0) - - Parameters: - x (Tensor): The input Sparse Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Sparse Tensor with the same data type and shape as ``x`` . - - Examples: - .. code-block:: python - - import paddle - import numpy as np - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - dense_x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) - sparse_x = dense_x.to_sparse_coo(1) - out = paddle.sparse.functional.relu(sparse_x) - """ - - assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_coo( - ), "Currently, sparse.relu only support the input of SparseCooTensor" - - return _C_ops.final_state_sparse_relu(x) diff --git a/python/paddle/sparse/functional/unary.py b/python/paddle/sparse/functional/unary.py new file mode 100644 index 0000000000000..860b4025d89e0 --- /dev/null +++ b/python/paddle/sparse/functional/unary.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] + +from paddle import _C_ops, in_dynamic_mode + + +def relu(x, name=None): + """ + sparse relu activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = max(x, 0) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.functional.relu(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_relu(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_relu(x) + else: + raise ValueError( + "Currently, sparse.relu only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def tanh(x, name=None): + """ + sparse tanh activation, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = tanh(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.tanh(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_tanh(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_tanh(x) + else: + raise ValueError( + "Currently, sparse.tanh only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sqrt(x, name=None): + """ + Calculate square root of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sqrt(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. 
code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([4, 0, 1], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sqrt(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_sqrt(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_sqrt(x) + else: + raise ValueError( + "Currently, sparse.sqrt only support the input of SparseCooTensor or SparseCsrTensor" + ) + + +def sin(x, name=None): + """ + Calculate sin of x, requiring x to be a sparse coo or sparse csr tensor. + + .. math:: + + out = sin(x) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Sparse Tensor with the same data type and shape as ``x`` . + + Examples: + .. code-block:: python + + import paddle + from paddle.fluid.framework import _test_eager_guard + + with _test_eager_guard(): + dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.sin(sparse_x) + """ + + assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" + + if x.is_sparse_coo(): + return _C_ops.final_state_sparse_coo_sin(x) + elif x.is_sparse_csr(): + return _C_ops.final_state_sparse_csr_sin(x) + else: + raise ValueError( + "Currently, sparse.sin only support the input of SparseCooTensor or SparseCsrTensor" + ) diff --git a/python/paddle/sparse/layer/__init__.py b/python/paddle/sparse/layer/__init__.py index 3a6d99392e4e8..8a814b514276f 100644 --- a/python/paddle/sparse/layer/__init__.py +++ b/python/paddle/sparse/layer/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .activation import ReLU +from .unary import ReLU from .norm import BatchNorm from .conv import Conv3D from .conv import SubmConv3D diff --git a/python/paddle/sparse/layer/activation.py b/python/paddle/sparse/layer/unary.py similarity index 100% rename from python/paddle/sparse/layer/activation.py rename to python/paddle/sparse/layer/unary.py diff --git a/python/paddle/utils/code_gen/sparse_api.yaml b/python/paddle/utils/code_gen/sparse_api.yaml index ca4330f2af362..ae3e9e6942233 100644 --- a/python/paddle/utils/code_gen/sparse_api.yaml +++ b/python/paddle/utils/code_gen/sparse_api.yaml @@ -7,6 +7,38 @@ intermediate : rulebook backward : conv3d_grad +- api : coo_relu + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_relu + layout : x + backward : sparse_coo_relu_grad + +- api : coo_sin + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_sin + layout : x + backward : sparse_coo_sin_grad + +- api : coo_sqrt + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_sqrt + layout : x + backward : sparse_coo_sqrt_grad + +- api : coo_tanh + args : (Tensor x) + output : Tensor(out@SparseCooTensor) + kernel : + func : sparse_coo_tanh + layout : x + backward : sparse_coo_tanh_grad + - api : coo_to_dense args : (Tensor x) output : Tensor(out@DenseTensor) @@ -30,6 +62,34 @@ data_type : values backward : create_sparse_coo_tensor_grad +- api : csr_relu + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_relu + layout : x + +- api : csr_sin + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_sin + layout : x + +- api : csr_sqrt + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_sqrt + layout : x + +- api : csr_tanh + args : (Tensor x) + output : Tensor(out@SparseCsrTensor) + kernel : + func : sparse_csr_tanh + layout : x + - api : csr_values args : (Tensor x) output : Tensor(out@DenseTensor) @@ -43,14 +103,6 @@ invoke : to_sparse_coo_impl(x, sparse_dim) backward : dense_to_coo_grad -- api : relu - args : (Tensor x) - output : Tensor(out@SparseCooTensor) - kernel : - func : sparse_relu - layout : x - backward : sparse_relu_grad - - api : to_dense args : (Tensor x) output : Tensor(out@DenseTensor) diff --git a/python/paddle/utils/code_gen/sparse_bw_api.yaml b/python/paddle/utils/code_gen/sparse_bw_api.yaml index 74299ed3e39a0..d8e8aad8f98b2 100644 --- a/python/paddle/utils/code_gen/sparse_bw_api.yaml +++ b/python/paddle/utils/code_gen/sparse_bw_api.yaml @@ -32,16 +32,37 @@ output : Tensor(x_grad@DenseTensor) invoke : to_dense_impl(out_grad) -- backward_api : sparse_maxpool_grad - forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) - args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) +- backward_api : sparse_coo_relu_grad + forward : sparse_coo_relu(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) output : Tensor(x_grad@SparseCooTensor) kernel : - func : sparse_maxpool_grad + func : sparse_coo_relu_grad -- backward_api : sparse_relu_grad - forward : sparse_relu(Tensor x) -> Tensor(out@SparseCooTensor) +- backward_api : sparse_coo_sin_grad + forward : sparse_coo_sin(Tensor x) -> Tensor(out@SparseCooTensor) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad@SparseCooTensor) kernel : - func : 
sparse_relu_grad + func : sparse_coo_sin_grad + +- backward_api : sparse_coo_sqrt_grad + forward : sparse_coo_sqrt(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_coo_sqrt_grad + +- backward_api : sparse_coo_tanh_grad + forward : sparse_coo_tanh(Tensor x) -> Tensor(out@SparseCooTensor) + args : (Tensor out, Tensor out_grad) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_coo_tanh_grad + +- backward_api : sparse_maxpool_grad + forward : sparse_maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out@SparseCooTensor), Tensor(rulebook@DenseTensor) + args : (Tensor x, Tensor rulebook, Tensor out, Tensor out_grad, int[] kernel_sizes) + output : Tensor(x_grad@SparseCooTensor) + kernel : + func : sparse_maxpool_grad From 1ce9c2ba93d39b06e02607b74a6295ab85645f71 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 12 May 2022 11:21:40 +0800 Subject: [PATCH 20/49] fix base docker image name, test=document_fix (#42673) --- tools/dockerfile/Dockerfile.ipu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index d6c46245e501c..ee2d984035624 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -6,7 +6,7 @@ # run a container # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash -FROM graphcore/poplar:poplar-extbaidu:2.5.0-ubuntu-18.04-20220407 +FROM graphcore/poplar-extbaidu:2.5.0-ubuntu-18.04-20220407 MAINTAINER PaddlePaddle Authors # ENV variables From eca8a5795dbdb810e9fdfb675deb31b4955dd364 Mon Sep 17 00:00:00 2001 From: zlsh80826 Date: Thu, 12 May 2022 11:23:45 +0800 Subject: [PATCH 21/49] Use tempfile to place the temporary files (#42626) * Use tempfile to place the temporary files * Revise test_bert to use tempfile for temporary files * Use tempfile for test_transformer * Fix test_dataset file race --- .../unittests/dygraph_to_static/test_bert.py | 274 +++++++++--------- .../dygraph_to_static/test_build_strategy.py | 17 +- .../dygraph_to_static/test_resnet.py | 259 +++++++++-------- .../dygraph_to_static/test_transformer.py | 14 + .../fluid/tests/unittests/test_dataset.py | 16 +- .../test_dataset_consistency_inspection.py | 19 +- 6 files changed, 318 insertions(+), 281 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py index a9e94ef09b9ac..db533e6379add 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bert.py @@ -14,6 +14,7 @@ import os import time +import tempfile import unittest import numpy as np @@ -33,32 +34,118 @@ SEED = 2020 STEP_NUM = 10 PRINT_STEP = 2 -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/bert" -MODEL_FILENAME = "bert" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "bert" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./bert.dygraph" - - -def train(bert_config, data_reader, to_static): - with fluid.dygraph.guard(place): - fluid.default_main_program().random_seed = SEED - fluid.default_startup_program().random_seed = SEED - - data_loader = fluid.io.DataLoader.from_generator( - capacity=50, iterable=True) - data_loader.set_batch_generator( - data_reader.data_generator(), places=place) - - bert 
= PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - - optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) - step_idx = 0 - speed_list = [] - for input_data in data_loader(): - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data - next_sent_acc, mask_lm_loss, total_loss = bert( + + +class TestBert(unittest.TestCase): + def setUp(self): + self.bert_config = get_bert_config() + self.data_reader = get_feed_data_reader(self.bert_config) + self.temp_dir = tempfile.TemporaryDirectory() + self.model_save_dir = os.path.join(self.temp_dir.name, 'inference') + self.model_save_prefix = os.path.join(self.model_save_dir, 'bert') + self.model_filename = 'bert' + INFER_MODEL_SUFFIX + self.params_filename = 'bert' + INFER_PARAMS_SUFFIX + self.dy_state_dict_save_path = os.path.join(self.temp_dir.name, + 'bert.dygraph') + + def tearDown(self): + self.temp_dir.cleanup() + + def train(self, bert_config, data_reader, to_static): + with fluid.dygraph.guard(place): + fluid.default_main_program().random_seed = SEED + fluid.default_startup_program().random_seed = SEED + + data_loader = fluid.io.DataLoader.from_generator( + capacity=50, iterable=True) + data_loader.set_batch_generator( + data_reader.data_generator(), places=place) + + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + + optimizer = fluid.optimizer.Adam(parameter_list=bert.parameters()) + step_idx = 0 + speed_list = [] + for input_data in data_loader(): + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_data + next_sent_acc, mask_lm_loss, total_loss = bert( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + input_mask=input_mask, + mask_label=mask_label, + mask_pos=mask_pos, + labels=labels) + total_loss.backward() + optimizer.minimize(total_loss) + bert.clear_gradients() + + acc = np.mean(np.array(next_sent_acc.numpy())) + loss = np.mean(np.array(total_loss.numpy())) + ppl = np.mean(np.exp(np.array(mask_lm_loss.numpy()))) + + if step_idx % PRINT_STEP == 0: + if step_idx == 0: + print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % + (step_idx, loss, ppl, acc)) + avg_batch_time = time.time() + else: + speed = PRINT_STEP / (time.time() - avg_batch_time) + speed_list.append(speed) + print( + "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" + % (step_idx, loss, ppl, acc, speed)) + avg_batch_time = time.time() + + step_idx += 1 + if step_idx == STEP_NUM: + if to_static: + fluid.dygraph.jit.save(bert, self.model_save_prefix) + else: + fluid.dygraph.save_dygraph(bert.state_dict(), + self.dy_state_dict_save_path) + break + return loss, ppl + + def train_dygraph(self, bert_config, data_reader): + program_translator.enable(False) + return self.train(bert_config, data_reader, False) + + def train_static(self, bert_config, data_reader): + program_translator.enable(True) + return self.train(bert_config, data_reader, True) + + def predict_static(self, data): + paddle.enable_static() + exe = fluid.Executor(place) + # load inference model + [inference_program, feed_target_names, + fetch_targets] = fluid.io.load_inference_model( + self.model_save_dir, + executor=exe, + model_filename=self.model_filename, + params_filename=self.params_filename) + pred_res = exe.run(inference_program, + feed=dict(zip(feed_target_names, data)), + fetch_list=fetch_targets) + + return pred_res + + def predict_dygraph(self, bert_config, data): + program_translator.enable(False) + with 
fluid.dygraph.guard(place): + bert = PretrainModelLayer( + config=bert_config, weight_sharing=False, use_fp16=False) + model_dict, _ = fluid.dygraph.load_dygraph( + self.dy_state_dict_save_path) + + bert.set_dict(model_dict) + bert.eval() + + input_vars = [fluid.dygraph.to_variable(x) for x in data] + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars + pred_res = bert( src_ids=src_ids, position_ids=pos_ids, sentence_ids=sent_ids, @@ -66,120 +153,33 @@ def train(bert_config, data_reader, to_static): mask_label=mask_label, mask_pos=mask_pos, labels=labels) - total_loss.backward() - optimizer.minimize(total_loss) - bert.clear_gradients() - - acc = np.mean(np.array(next_sent_acc.numpy())) - loss = np.mean(np.array(total_loss.numpy())) - ppl = np.mean(np.exp(np.array(mask_lm_loss.numpy()))) - - if step_idx % PRINT_STEP == 0: - if step_idx == 0: - print("Step: %d, loss: %f, ppl: %f, next_sent_acc: %f" % - (step_idx, loss, ppl, acc)) - avg_batch_time = time.time() - else: - speed = PRINT_STEP / (time.time() - avg_batch_time) - speed_list.append(speed) - print( - "Step: %d, loss: %f, ppl: %f, next_sent_acc: %f, speed: %.3f steps/s" - % (step_idx, loss, ppl, acc, speed)) - avg_batch_time = time.time() - - step_idx += 1 - if step_idx == STEP_NUM: - if to_static: - fluid.dygraph.jit.save(bert, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(bert.state_dict(), - DY_STATE_DICT_SAVE_PATH) - break - return loss, ppl - - -def train_dygraph(bert_config, data_reader): - program_translator.enable(False) - return train(bert_config, data_reader, False) - - -def train_static(bert_config, data_reader): - program_translator.enable(True) - return train(bert_config, data_reader, True) - - -def predict_static(data): - paddle.enable_static() - exe = fluid.Executor(place) - # load inference model - [inference_program, feed_target_names, - fetch_targets] = fluid.io.load_inference_model( - MODEL_SAVE_DIR, - executor=exe, - model_filename=MODEL_FILENAME, - params_filename=PARAMS_FILENAME) - pred_res = exe.run(inference_program, - feed=dict(zip(feed_target_names, data)), - fetch_list=fetch_targets) - - return pred_res - - -def predict_dygraph(bert_config, data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - bert = PretrainModelLayer( - config=bert_config, weight_sharing=False, use_fp16=False) - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - - bert.set_dict(model_dict) - bert.eval() - - input_vars = [fluid.dygraph.to_variable(x) for x in data] - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = input_vars - pred_res = bert( - src_ids=src_ids, - position_ids=pos_ids, - sentence_ids=sent_ids, - input_mask=input_mask, - mask_label=mask_label, - mask_pos=mask_pos, - labels=labels) - pred_res = [var.numpy() for var in pred_res] + pred_res = [var.numpy() for var in pred_res] - return pred_res - - -def predict_dygraph_jit(data): - with fluid.dygraph.guard(place): - bert = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX) - bert.eval() - - src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data - pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, - mask_pos, labels) - pred_res = [var.numpy() for var in pred_res] - - return pred_res + return pred_res + def predict_dygraph_jit(self, data): + with fluid.dygraph.guard(place): + bert = fluid.dygraph.jit.load(self.model_save_prefix) + bert.eval() -def predict_analysis_inference(data): - output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, 
PARAMS_FILENAME, - data) - out = output() - return out + src_ids, pos_ids, sent_ids, input_mask, mask_label, mask_pos, labels = data + pred_res = bert(src_ids, pos_ids, sent_ids, input_mask, mask_label, + mask_pos, labels) + pred_res = [var.numpy() for var in pred_res] + return pred_res -class TestBert(unittest.TestCase): - def setUp(self): - self.bert_config = get_bert_config() - self.data_reader = get_feed_data_reader(self.bert_config) + def predict_analysis_inference(self, data): + output = PredictorTools(self.model_save_dir, self.model_filename, + self.params_filename, data) + out = output() + return out def test_train(self): - static_loss, static_ppl = train_static(self.bert_config, - self.data_reader) - dygraph_loss, dygraph_ppl = train_dygraph(self.bert_config, - self.data_reader) + static_loss, static_ppl = self.train_static(self.bert_config, + self.data_reader) + dygraph_loss, dygraph_ppl = self.train_dygraph(self.bert_config, + self.data_reader) self.assertTrue( np.allclose(static_loss, dygraph_loss), msg="static_loss: {} \n dygraph_loss: {}".format(static_loss, @@ -193,10 +193,10 @@ def test_train(self): def verify_predict(self): for data in self.data_reader.data_generator()(): - dygraph_pred_res = predict_dygraph(self.bert_config, data) - static_pred_res = predict_static(data) - dygraph_jit_pred_res = predict_dygraph_jit(data) - predictor_pred_res = predict_analysis_inference(data) + dygraph_pred_res = self.predict_dygraph(self.bert_config, data) + static_pred_res = self.predict_static(data) + dygraph_jit_pred_res = self.predict_dygraph_jit(data) + predictor_pred_res = self.predict_analysis_inference(data) for dy_res, st_res, dy_jit_res, predictor_res in zip( dygraph_pred_res, static_pred_res, dygraph_jit_pred_res, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py index f7d469327a307..95ea5ad227eeb 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_build_strategy.py @@ -18,8 +18,7 @@ import numpy as np from paddle.jit import ProgramTranslator -from test_resnet import ResNet, train, predict_dygraph_jit -from test_resnet import predict_dygraph, predict_static, predict_analysis_inference +from test_resnet import ResNet, ResNetHelper program_translator = ProgramTranslator() @@ -31,20 +30,20 @@ def setUp(self): self.build_strategy.fuse_bn_act_ops = True self.build_strategy.fuse_bn_add_act_ops = True self.build_strategy.enable_addto = True + self.resnet_helper = ResNetHelper() # NOTE: for enable_addto paddle.fluid.set_flags({"FLAGS_max_inplace_grad_add": 8}) def train(self, to_static): program_translator.enable(to_static) - - return train(to_static, self.build_strategy) + return self.resnet_helper.train(to_static, self.build_strategy) def verify_predict(self): image = np.random.random([1, 3, 224, 224]).astype('float32') - dy_pre = predict_dygraph(image) - st_pre = predict_static(image) - dy_jit_pre = predict_dygraph_jit(image) - predictor_pre = predict_analysis_inference(image) + dy_pre = self.resnet_helper.predict_dygraph(image) + st_pre = self.resnet_helper.predict_static(image) + dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image) + predictor_pre = self.resnet_helper.predict_analysis_inference(image) self.assertTrue( np.allclose(dy_pre, st_pre), msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre)) @@ -69,7 +68,7 @@ def 
test_in_static_mode_mkldnn(self): paddle.fluid.set_flags({'FLAGS_use_mkldnn': True}) try: if paddle.fluid.core.is_compiled_with_mkldnn(): - train(True, self.build_strategy) + self.resnet_helper.train(True, self.build_strategy) finally: paddle.fluid.set_flags({'FLAGS_use_mkldnn': False}) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index efb69b530efc9..1a531c65bbf1e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -14,8 +14,10 @@ from __future__ import print_function +import os import math import time +import tempfile import unittest import numpy as np @@ -39,11 +41,6 @@ place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ else fluid.CPUPlace() -MODEL_SAVE_DIR = "./inference" -MODEL_SAVE_PREFIX = "./inference/resnet" -MODEL_FILENAME = "resnet" + INFER_MODEL_SUFFIX -PARAMS_FILENAME = "resnet" + INFER_PARAMS_SUFFIX -DY_STATE_DICT_SAVE_PATH = "./resnet.dygraph" program_translator = ProgramTranslator() if fluid.is_compiled_with_cuda(): @@ -212,130 +209,148 @@ def __reader__(): return __reader__ -def train(to_static, build_strategy=None): - """ - Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode. - """ - with fluid.dygraph.guard(place): - np.random.seed(SEED) - paddle.seed(SEED) - paddle.framework.random._manual_program_seed(SEED) - - train_reader = paddle.batch( - reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), - batch_size=batch_size, - drop_last=True) - data_loader = fluid.io.DataLoader.from_generator( - capacity=5, iterable=True) - data_loader.set_sample_list_generator(train_reader) - - resnet = ResNet() - if to_static: - resnet = paddle.jit.to_static(resnet, build_strategy=build_strategy) - optimizer = optimizer_setting(parameter_list=resnet.parameters()) - - for epoch in range(epoch_num): - total_loss = 0.0 - total_acc1 = 0.0 - total_acc5 = 0.0 - total_sample = 0 - - for batch_id, data in enumerate(data_loader()): - start_time = time.time() - img, label = data - - pred = resnet(img) - loss = fluid.layers.cross_entropy(input=pred, label=label) - avg_loss = fluid.layers.mean(x=loss) - acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1) - acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5) - - avg_loss.backward() - optimizer.minimize(avg_loss) - resnet.clear_gradients() - - total_loss += avg_loss - total_acc1 += acc_top1 - total_acc5 += acc_top5 - total_sample += 1 - - end_time = time.time() - if batch_id % 2 == 0: - print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \ - ( epoch, batch_id, total_loss.numpy() / total_sample, \ - total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time)) - if batch_id == 10: - if to_static: - fluid.dygraph.jit.save(resnet, MODEL_SAVE_PREFIX) - else: - fluid.dygraph.save_dygraph(resnet.state_dict(), - DY_STATE_DICT_SAVE_PATH) - # avoid dataloader throw abort signaal - data_loader._reset() - break - - return total_loss.numpy() - - -def predict_dygraph(data): - program_translator.enable(False) - with fluid.dygraph.guard(place): - resnet = ResNet() - - model_dict, _ = fluid.dygraph.load_dygraph(DY_STATE_DICT_SAVE_PATH) - resnet.set_dict(model_dict) - resnet.eval() - - pred_res = resnet(fluid.dygraph.to_variable(data)) - - return pred_res.numpy() - - 
-def predict_static(data):
-    paddle.enable_static()
-    exe = fluid.Executor(place)
-    [inference_program, feed_target_names,
-     fetch_targets] = fluid.io.load_inference_model(
-         MODEL_SAVE_DIR,
-         executor=exe,
-         model_filename=MODEL_FILENAME,
-         params_filename=PARAMS_FILENAME)
-
-    pred_res = exe.run(inference_program,
-                       feed={feed_target_names[0]: data},
-                       fetch_list=fetch_targets)
-
-    return pred_res[0]
-
-
-def predict_dygraph_jit(data):
-    with fluid.dygraph.guard(place):
-        resnet = fluid.dygraph.jit.load(MODEL_SAVE_PREFIX)
-        resnet.eval()
-
-        pred_res = resnet(data)
-
-        return pred_res.numpy()
-
-
-def predict_analysis_inference(data):
-    output = PredictorTools(MODEL_SAVE_DIR, MODEL_FILENAME, PARAMS_FILENAME,
-                            [data])
-    out = output()
-    return out
+class ResNetHelper:
+    def __init__(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.model_save_dir = os.path.join(self.temp_dir.name, 'inference')
+        self.model_save_prefix = os.path.join(self.model_save_dir, 'resnet')
+        self.model_filename = 'resnet' + INFER_MODEL_SUFFIX
+        self.params_filename = 'resnet' + INFER_PARAMS_SUFFIX
+        self.dy_state_dict_save_path = os.path.join(self.temp_dir.name,
+                                                    'resnet.dygraph')
+
+    def __del__(self):
+        self.temp_dir.cleanup()
+
+    def train(self, to_static, build_strategy=None):
+        """
+        Tests model decorated by `dygraph_to_static_output` in static mode. For users, the model is defined in dygraph mode and trained in static mode.
+        """
+        with fluid.dygraph.guard(place):
+            np.random.seed(SEED)
+            paddle.seed(SEED)
+            paddle.framework.random._manual_program_seed(SEED)
+
+            train_reader = paddle.batch(
+                reader_decorator(paddle.dataset.flowers.train(use_xmap=False)),
+                batch_size=batch_size,
+                drop_last=True)
+            data_loader = fluid.io.DataLoader.from_generator(
+                capacity=5, iterable=True)
+            data_loader.set_sample_list_generator(train_reader)
+
+            resnet = ResNet()
+            if to_static:
+                resnet = paddle.jit.to_static(
+                    resnet, build_strategy=build_strategy)
+            optimizer = optimizer_setting(parameter_list=resnet.parameters())
+
+            for epoch in range(epoch_num):
+                total_loss = 0.0
+                total_acc1 = 0.0
+                total_acc5 = 0.0
+                total_sample = 0
+
+                for batch_id, data in enumerate(data_loader()):
+                    start_time = time.time()
+                    img, label = data
+
+                    pred = resnet(img)
+                    loss = fluid.layers.cross_entropy(input=pred, label=label)
+                    avg_loss = fluid.layers.mean(x=loss)
+                    acc_top1 = fluid.layers.accuracy(
+                        input=pred, label=label, k=1)
+                    acc_top5 = fluid.layers.accuracy(
+                        input=pred, label=label, k=5)
+
+                    avg_loss.backward()
+                    optimizer.minimize(avg_loss)
+                    resnet.clear_gradients()
+
+                    total_loss += avg_loss
+                    total_acc1 += acc_top1
+                    total_acc5 += acc_top5
+                    total_sample += 1
+
+                    end_time = time.time()
+                    if batch_id % 2 == 0:
+                        print( "epoch %d | batch step %d, loss %0.3f, acc1 %0.3f, acc5 %0.3f, time %f" % \
+                            ( epoch, batch_id, total_loss.numpy() / total_sample, \
+                                total_acc1.numpy() / total_sample, total_acc5.numpy() / total_sample, end_time-start_time))
+                    if batch_id == 10:
+                        if to_static:
+                            fluid.dygraph.jit.save(resnet,
+                                                   self.model_save_prefix)
+                        else:
+                            fluid.dygraph.save_dygraph(
+                                resnet.state_dict(),
+                                self.dy_state_dict_save_path)
+                        # avoid dataloader throw abort signal
+                        data_loader._reset()
+                        break
+
+        return total_loss.numpy()
+
+    def predict_dygraph(self, data):
+        program_translator.enable(False)
+        with fluid.dygraph.guard(place):
+            resnet = ResNet()
+
+            model_dict, _ = fluid.dygraph.load_dygraph(
+                self.dy_state_dict_save_path)
+            resnet.set_dict(model_dict)
+            resnet.eval()
+
+            pred_res = resnet(fluid.dygraph.to_variable(data))
+
+            return pred_res.numpy()
+
+    def predict_static(self, data):
+        paddle.enable_static()
+        exe = fluid.Executor(place)
+        [inference_program, feed_target_names,
+         fetch_targets] = fluid.io.load_inference_model(
+             self.model_save_dir,
+             executor=exe,
+             model_filename=self.model_filename,
+             params_filename=self.params_filename)
+
+        pred_res = exe.run(inference_program,
+                           feed={feed_target_names[0]: data},
+                           fetch_list=fetch_targets)
+
+        return pred_res[0]
+
+    def predict_dygraph_jit(self, data):
+        with fluid.dygraph.guard(place):
+            resnet = fluid.dygraph.jit.load(self.model_save_prefix)
+            resnet.eval()
+
+            pred_res = resnet(data)
+
+            return pred_res.numpy()
+
+    def predict_analysis_inference(self, data):
+        output = PredictorTools(self.model_save_dir, self.model_filename,
+                                self.params_filename, [data])
+        out = output()
+        return out
 
 
 class TestResnet(unittest.TestCase):
+    def setUp(self):
+        self.resnet_helper = ResNetHelper()
+
     def train(self, to_static):
         program_translator.enable(to_static)
-        return train(to_static)
+        return self.resnet_helper.train(to_static)
 
     def verify_predict(self):
         image = np.random.random([1, 3, 224, 224]).astype('float32')
-        dy_pre = predict_dygraph(image)
-        st_pre = predict_static(image)
-        dy_jit_pre = predict_dygraph_jit(image)
-        predictor_pre = predict_analysis_inference(image)
+        dy_pre = self.resnet_helper.predict_dygraph(image)
+        st_pre = self.resnet_helper.predict_static(image)
+        dy_jit_pre = self.resnet_helper.predict_dygraph_jit(image)
+        predictor_pre = self.resnet_helper.predict_analysis_inference(image)
         self.assertTrue(
             np.allclose(dy_pre, st_pre),
             msg="dy_pre:\n {}\n, st_pre: \n{}.".format(dy_pre, st_pre))
@@ -360,7 +375,7 @@ def test_in_static_mode_mkldnn(self):
         fluid.set_flags({'FLAGS_use_mkldnn': True})
         try:
             if paddle.fluid.core.is_compiled_with_mkldnn():
-                train(to_static=True)
+                self.resnet_helper.train(to_static=True)
         finally:
             fluid.set_flags({'FLAGS_use_mkldnn': False})
 
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
index 06f2c60dfae9f..c8fe3e3932914 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py
@@ -15,6 +15,7 @@
 import logging
 import os
 import time
+import tempfile
 import unittest
 
 import numpy as np
@@ -371,8 +372,21 @@ def predict_static(args, batch_generator):
 
 
 class TestTransformer(unittest.TestCase):
+    def setUp(self):
+        self.temp_dir = tempfile.TemporaryDirectory()
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+
     def prepare(self, mode='train'):
         args = util.ModelHyperParams()
+        args.save_dygraph_model_path = os.path.join(
+            self.temp_dir.name, args.save_dygraph_model_path)
+        args.save_static_model_path = os.path.join(self.temp_dir.name,
+                                                   args.save_static_model_path)
+        args.inference_model_dir = os.path.join(self.temp_dir.name,
+                                                args.inference_model_dir)
+        args.output_file = os.path.join(self.temp_dir.name, args.output_file)
         batch_generator = util.get_feed_data_reader(args, mode)
         return args, batch_generator
 
diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py
index 348945b73e1a4..5ef5a1016cc8b 100644
--- a/python/paddle/fluid/tests/unittests/test_dataset.py
+++ b/python/paddle/fluid/tests/unittests/test_dataset.py
@@ -24,6 +24,7 @@
 import numpy as np
 import os
 import shutil
+import tempfile
 import unittest
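# A note on the pattern this series adopts: each test builds its scratch
# paths inside its own tempfile.TemporaryDirectory and cleans it up
# afterwards, so concurrent test runs no longer race on shared filenames in
# the working directory. A minimal, self-contained sketch (illustrative
# only; the file name below is not from the patch):
import os
import tempfile

temp_dir = tempfile.TemporaryDirectory()
scratch_path = os.path.join(temp_dir.name, 'scratch.txt')
with open(scratch_path, 'w') as f:
    f.write('sample data\n')
# ... exercise the code under test against scratch_path ...
temp_dir.cleanup()  # removes the directory and every file created inside it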
@@ -82,12 +83,17 @@ def test_run_with_dump(self): """ Testcase for InMemoryDataset from create to run. """ - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + dump_b_path = os.path.join(temp_dir.name, 'test_run_with_dump_b.txt') + + with open(dump_a_path, "w") as f: data = "1 a 1 a 1 1 2 3 3 4 5 5 5 5 1 1\n" data += "1 b 1 b 1 2 2 3 4 4 6 6 6 6 1 2\n" data += "1 c 1 c 1 3 2 3 5 4 7 7 7 7 1 3\n" f.write(data) - with open("test_run_with_dump_b.txt", "w") as f: + with open(dump_b_path, "w") as f: data = "1 d 1 d 1 4 2 3 3 4 5 5 5 5 1 4\n" data += "1 e 1 e 1 5 2 3 4 4 6 6 6 6 1 5\n" data += "1 f 1 f 1 6 2 3 5 4 7 7 7 7 1 6\n" @@ -110,8 +116,7 @@ def test_run_with_dump(self): parse_content=True, fea_eval=True, candidate_size=10000) - dataset.set_filelist( - ["test_run_with_dump_a.txt", "test_run_with_dump_b.txt"]) + dataset.set_filelist([dump_a_path, dump_b_path]) dataset.load_into_memory() dataset.local_shuffle() @@ -129,8 +134,7 @@ def test_run_with_dump(self): except Exception as e: self.assertTrue(False) - os.remove("./test_run_with_dump_a.txt") - os.remove("./test_run_with_dump_b.txt") + temp_dir.cleanup() def test_dataset_config(self): """ Testcase for dataset configuration. """ diff --git a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py index 5911ada1817b6..911bee69e8b77 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py +++ b/python/paddle/fluid/tests/unittests/test_dataset_consistency_inspection.py @@ -25,6 +25,7 @@ import math import os import shutil +import tempfile import unittest import paddle.fluid.incubate.data_generator as dg @@ -282,7 +283,11 @@ def test_var_consistency_insepection(self): """ Testcase for InMemoryDataset of consistency insepection of use_var_list and data_generator. 
""" - with open("test_run_with_dump_a.txt", "w") as f: + + temp_dir = tempfile.TemporaryDirectory() + dump_a_path = os.path.join(temp_dir.name, 'test_run_with_dump_a.txt') + + with open(dump_a_path, "w") as f: # data = "\n" # data += "\n" data = "2 1;1 9;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;0;40000001;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000200;10000200;10063938;10000008;10000177;20002001 20001240 20001860 20003611 20010833 20000210 20000500 20000401 20000251 20012198 20001023 20000157;20002001 20001240 20001860 20003611 20012396 20000500 20002513 20012198 20001023 20000157;10000123;30000004;0.623 0.233 0.290 0.208 0.354 49.000 0.000 0.000 0.000 -1.000 0.569 0.679 0.733 53 17 2 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000047;30000004;0.067 0.000 0.161 0.005 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.378 0.043 0 6 0 0;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20000723 20000070 20003519 20000005;10000200;30000001;0.407 0.111 0.196 0.095 0.181 49.000 0.000 0.000 0.000 -1.000 0.306 0.538 0.355 48 8 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20002616 20000157 20000005;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20003519 20000005;10000200;30000001;0.226 0.029 0.149 0.031 0.074 49.000 0.000 0.000 0.000 -1.000 0.220 0.531 0.286 26 6 0 0;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20012788 20000157;20002001 20001240 20001860 20003611 20000723 20000070 20002001 20001240 20001860 20003611 20131464;10063938;30000001;0.250 0.019 0.138 0.012 0.027 49.000 0.000 0.000 0.000 -1.000 0.370 0.449 0.327 7 2 0 0;20002001 20001240 20001860 20003611 20000723;20002001 20001240 20001860 20003611 20000723;10000003;30000002;0.056 0.000 0.139 0.003 0.000 49.000 0.000 0.000 0.000 -1.000 0.000 0.346 0.059 15 3 0 0;20002001 20001240 20001860 20003611 20000623 20000251 20000157 20000723 20000070 20000001 20000057;20002001 20001240 20001860 20003611 20018820 20000157 20000723 20000070 20000001 20000057;10000008;30000001;0.166 0.004 0.127 0.001 0.004 49.000 0.000 0.000 0.000 -1.000 0.103 0.417 0.394 10 3 0 0;20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000177;30000001;0.094 0.008 0.157 0.012 0.059 49.000 0.000 0.000 0.000 -1.000 0.051 0.382 0.142 21 0 0 0;20002001 20001240 20001860 20003611 20000157 20001776 20000070 20000157;20002001 20001240 20001860 20003611 20000157 20001776 20000070 
20000157;10000134;30000001;0.220 0.016 0.181 0.037 0.098 49.000 0.000 0.000 0.000 -1.000 0.192 0.453 0.199 17 1 0 0;20002001 20001240 20001860 20003611 20002640 20004695 20000157 20000723 20000070 20002001 20001240 20001860 20003611;20002001 20001240 20001860 20003611 20002640 20034154 20000723 20000070 20002001 20001240 20001860 20003611;10000638;30000001;0.000 0.000 0.000 0.000 0.000 49.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0 0 0 0;\n" @@ -348,7 +353,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=0) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) print("case 1: check passed!") except Exception as e: print("warning: catch expected error") @@ -360,7 +365,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=2) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 2 catch expected error") print(e) @@ -371,7 +376,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=3) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 3 catch expected error") print(e) @@ -382,7 +387,7 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=4) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 4 catch expected error") print(e) @@ -393,13 +398,13 @@ def test_var_consistency_insepection(self): generator_class = CTRDataset(mode=5) try: dataset._check_use_var_with_data_generator( - slot_data, generator_class, "test_run_with_dump_a.txt") + slot_data, generator_class, dump_a_path) except Exception as e: print("warning: case 5 catch expected error") print(e) print("========================================") - os.remove("./test_run_with_dump_a.txt") + temp_dir.cleanup() if __name__ == '__main__': From a7926ef2d81381a54f744de9aa33483fcaf25609 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Thu, 12 May 2022 11:29:18 +0800 Subject: [PATCH 22/49] add approval for _enable_legacy_dygraph in unittest (#42687) * add approval for eager CI * add wiki manual --- tools/check_file_diff_approvals.sh | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index b0800a9cd845e..8420590399549 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -391,6 +391,22 @@ if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then fi fi +if [ "${UNITTEST_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + ERROR_LINES="" + for TEST_FILE in ${UNITTEST_FILE_CHANGED}; + do + ENABLE_LEGACY_DYGRAPH_CI=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${TEST_FILE} |grep "_enable_legacy_dygraph" || true` + if [ "${ENABLE_LEGACY_DYGRAPH_CI}" != "" ]; then + ERROR_LINES="${ERROR_LINES}\n${TEST_FILE}\n${ENABLE_LEGACY_DYGRAPH_CI}\n" + fi + done + if [ "${ERROR_LINES}" != "" ]; then + ERROR_LINES=${ERROR_LINES//+/'\n+\t'} + echo_line="_enable_legacy_dygraph forces the mode to old dynamic graph. 
You must have one RD (pangyoki (Recommend), Aurelius84 or JiabinYang) approval for the usage (either add or delete) of _enable_legacy_dygraph. For more information, please refer to: https://github.com/PaddlePaddle/Paddle/wiki/Enable-Eager-Mode-in-Paddle-CI. The corresponding lines are as follows:\n${ERROR_LINES}\n" + check_approval 1 26408901 9301846 22361972 + fi +fi + RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|grep -E "CMakeLists.txt"||true` if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED}; From 5914b18a084db20c9ca40a0daaea3f5b925176ba Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 12 May 2022 13:00:14 +0800 Subject: [PATCH 23/49] [Paddle-Inference] support transformer generation: some passes (#42664) * [Paddle-Inference] support transformer generation: some passes --- paddle/fluid/framework/ir/CMakeLists.txt | 4 + ...ete_remove_padding_recover_padding_pass.cc | 100 ++++++ ...lete_remove_padding_recover_padding_pass.h | 59 ++++ .../ir/remove_padding_recover_padding_pass.cc | 298 ++++++++++++++++++ .../ir/remove_padding_recover_padding_pass.h | 94 ++++++ .../ir/set_transformer_input_convert_pass.cc | 161 ++++++++++ .../ir/set_transformer_input_convert_pass.h | 80 +++++ .../ir_passes/tensorrt_subgraph_pass.cc | 7 +- .../inference/api/paddle_pass_builder.cc | 27 +- 9 files changed, 812 insertions(+), 18 deletions(-) create mode 100644 paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc create mode 100644 paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h create mode 100644 paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc create mode 100644 paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h create mode 100644 paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc create mode 100644 paddle/fluid/framework/ir/set_transformer_input_convert_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d000dc7085365..b430a409e9965 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -107,6 +107,9 @@ if(WITH_TENSORRT) pass_library(trt_map_matmul_to_mul_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_skip_layernorm_fuse_pass inference) + pass_library(set_transformer_input_convert_pass inference) + pass_library(remove_padding_recover_padding_pass inference) + pass_library(delete_remove_padding_recover_padding_pass inference) endif() if(WITH_GPU OR WITH_ROCM) @@ -161,6 +164,7 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) + pass_library(transfer_cast_op_pass base DIR ipu) endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..63233e0b584b2 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.cc @@ -0,0 +1,100 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +void RecoverPadding::operator()() { + // Create nodes for recover_padding. + auto *recover_padding_input = + pattern->NewNode(recover_padding_input_repr()) + ->assert_is_op_input("recover_padding", "Input"); + auto *recover_padding_op = pattern->NewNode(recover_padding_op_repr()) + ->assert_is_op("recover_padding"); + auto *recover_padding_out = + pattern->NewNode(recover_padding_out_repr()) + ->assert_is_op_output("recover_padding", "Out"); + + // Add links for recover_padding op. + recover_padding_op->LinksFrom({recover_padding_input}) + .LinksTo({recover_padding_out}); +} +} // namespace patterns + +void DeleteRemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + int found_subgraph_count = 0; + + // + GraphPatternDetector gpd; + patterns::RecoverPadding recover_padding( + gpd.mutable_pattern(), "delete_remove_padding_recover_padding_pass"); + recover_padding(); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *graph) { + VLOG(3) << "delete_remove_padding_recover_padding_pass"; + + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_input, recover_padding_input, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_op, recover_padding_op, + recover_padding); + GET_IR_NODE_FROM_SUBGRAPH(recover_padding_out, recover_padding_out, + recover_padding); + + std::unordered_set del_node_set; + + bool delete_recover_padding = true; + for (size_t i = 0; i < recover_padding_out->outputs.size(); ++i) { + if (recover_padding_out->outputs[i]->Name() == + "remove_padding") { // op_node + auto *remove_padding_out_node = + recover_padding_out->outputs[i]->outputs[0]; // var_node + auto *out_op_node = remove_padding_out_node->outputs[0]; // op_node + IR_NODE_LINK_TO(recover_padding_input, out_op_node); + del_node_set.insert(recover_padding_out->outputs[i]); + del_node_set.insert(remove_padding_out_node); + out_op_node->Op()->RenameInput(remove_padding_out_node->Name(), + recover_padding_input->Name()); + found_subgraph_count++; + } else { + delete_recover_padding = false; + } + } + if (delete_recover_padding) { + del_node_set.insert(recover_padding_op); + del_node_set.insert(recover_padding_out); + } + GraphSafeRemoveNodes(graph, del_node_set); + }; + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_remove_padding_recover_padding_pass, + paddle::framework::ir::DeleteRemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..3504b124c91d1 --- /dev/null +++ 
b/paddle/fluid/framework/ir/delete_remove_padding_recover_padding_pass.h @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +struct RecoverPadding : public PatternBase { + RecoverPadding(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "recover_padding") {} + + void operator()(); + + PATTERN_DECL_NODE(recover_padding_input); + PATTERN_DECL_NODE(recover_padding_op); + PATTERN_DECL_NODE(recover_padding_out); +}; +} // namespace patterns + +class DeleteRemovePaddingRecoverPaddingPass : public FusePassBase { + public: + DeleteRemovePaddingRecoverPaddingPass() {} + virtual ~DeleteRemovePaddingRecoverPaddingPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"delete_remove_padding_recover_padding_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc new file mode 100644 index 0000000000000..67dfe074dc075 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.cc @@ -0,0 +1,298 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +void SkipLayernorm::operator()() { + // Create nodes for skip_layernorm. + auto* skip_layernorm_x = pattern->NewNode(skip_layernorm_x_repr()) + ->assert_is_op_input("skip_layernorm", "X"); + auto* skip_layernorm_y = pattern->NewNode(skip_layernorm_y_repr()) + ->assert_is_op_input("skip_layernorm", "Y"); + auto* skip_layernorm_op = pattern->NewNode(skip_layernorm_op_repr()) + ->assert_is_op("skip_layernorm"); + auto* skip_layernorm_out = pattern->NewNode(skip_layernorm_out_repr()) + ->assert_is_op_output("skip_layernorm", "Out"); + + // Add links for skip_layernorm op. 
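+  // LinksFrom declares the op pattern-node's required input var nodes and
+  // LinksTo its output var node; the detector only reports subgraphs that
+  // contain this wiring.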
+ skip_layernorm_op->LinksFrom({skip_layernorm_x, skip_layernorm_y}) + .LinksTo({skip_layernorm_out}); +} + +void MultiheadMatmul::operator()() { + // Create nodes for multihead_matmul. + auto* multihead_matmul_input = + pattern->NewNode(multihead_matmul_input_repr()) + ->assert_is_op_input("multihead_matmul", "Input"); + auto* multihead_matmul_op = pattern->NewNode(multihead_matmul_op_repr()) + ->assert_is_op("multihead_matmul"); + auto* multihead_matmul_out = + pattern->NewNode(multihead_matmul_out_repr()) + ->assert_is_op_output("multihead_matmul", "Out"); + + // Add links for multihead_matmul op. + multihead_matmul_op->LinksFrom({multihead_matmul_input}) + .LinksTo({multihead_matmul_out}); +} + +void Fc::operator()() { + // Create nodes for fc. + auto* fc_input = + pattern->NewNode(fc_input_repr())->assert_is_op_input("fc", "Input"); + auto* fc_op = pattern->NewNode(fc_op_repr())->assert_is_op("fc"); + auto* fc_out = + pattern->NewNode(fc_out_repr())->assert_is_op_output("fc", "Out"); + + // Add links for fc op. + fc_op->LinksFrom({fc_input}).LinksTo({fc_out}); +} + +void Activation::operator()() { + // Create nodes for activation. + std::unordered_set activation_ops{"relu", "sigmoid", "tanh"}; + auto* activation_input = pattern->NewNode(activation_input_repr()) + ->assert_is_ops_input(activation_ops); + auto* activation_op = + pattern->NewNode(activation_op_repr())->assert_is_ops(activation_ops); + auto* activation_out = pattern->NewNode(activation_out_repr()) + ->assert_is_ops_output(activation_ops); + + // Add links for activation op. + activation_op->LinksFrom({activation_input}).LinksTo({activation_out}); +} +} // namespace patterns + +void RemovePaddingRecoverPaddingPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + int found_subgraph_count = 0; + + // Create an remove_padding op node + auto insert_remove_padding_op = [&](Node* input_node, Node* op_node) { + // create op, var in graph + OpDesc remove_padding; + std::string remove_padding_out_name = + input_node->Name() + ".remove_padding"; + + VarDesc remove_padding_out(remove_padding_out_name); + remove_padding_out.SetDataType(input_node->Var()->GetDataType()); + remove_padding_out.SetShape(input_node->Var()->GetShape()); + remove_padding_out.SetPersistable(false); + + // remove_padding_op + remove_padding.SetType("remove_padding"); + + // input + remove_padding.SetInput("Input", {input_node->Name()}); + + // output + remove_padding.SetOutput("Out", {remove_padding_out_name}); + + auto remove_padding_op_node = graph->CreateOpNode(&remove_padding); + auto remove_padding_out_node = graph->CreateVarNode(&remove_padding_out); + + // replace link + for (size_t i = 0; i < input_node->outputs.size(); ++i) { + if (input_node->outputs[i] == op_node) { + input_node->outputs[i] = remove_padding_op_node; + remove_padding_op_node->inputs.push_back(input_node); + } + } + + // link node + IR_NODE_LINK_TO(remove_padding_op_node, remove_padding_out_node); + + // replace link + for (size_t i = 0; i < op_node->inputs.size(); ++i) { + if (op_node->inputs[i] == input_node) { + op_node->inputs[i] = remove_padding_out_node; + remove_padding_out_node->outputs.push_back(op_node); + } + } + + // create variable in scope + scope->Var(remove_padding_out_name); + auto* remove_padding_out_tensor = + scope->FindVar(remove_padding_out_name)->GetMutable(); + 
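+    // Materialize the new variable's tensor on the GPU up front so it is
+    // initialized before the runtime remove_padding op writes its compacted
+    // output into it.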
remove_padding_out_tensor->mutable_data(platform::CUDAPlace());
+
+    // rename
+    op_node->Op()->RenameInput(input_node->Name(),
+                               remove_padding_out_node->Name());
+  };
+
+  // Create a recover_padding op node
+  auto insert_recover_padding_op = [&](Node* op_node, Node* out_node) {
+    // create op, var in graph
+    OpDesc recover_padding;
+    std::string recover_padding_input_name =
+        out_node->Name() + ".recover_padding";
+    VarDesc recover_padding_input(recover_padding_input_name);
+    recover_padding_input.SetDataType(out_node->Var()->GetDataType());
+    recover_padding_input.SetShape(out_node->Var()->GetShape());
+    recover_padding_input.SetPersistable(false);
+
+    // recover_padding_op
+    recover_padding.SetType("recover_padding");
+
+    // input
+    recover_padding.SetInput("Input", {recover_padding_input_name});
+
+    // output
+    recover_padding.SetOutput("Out", {out_node->Name()});
+
+    auto recover_padding_op_node = graph->CreateOpNode(&recover_padding);
+    auto recover_padding_input_node =
+        graph->CreateVarNode(&recover_padding_input);
+
+    // replace link
+    for (size_t i = 0; i < op_node->outputs.size(); ++i) {
+      if (op_node->outputs[i] == out_node) {
+        op_node->outputs[i] = recover_padding_input_node;
+        recover_padding_input_node->inputs.push_back(op_node);
+      }
+    }
+
+    // link node
+    IR_NODE_LINK_TO(recover_padding_input_node, recover_padding_op_node);
+
+    // replace link
+    for (size_t i = 0; i < out_node->inputs.size(); ++i) {
+      if (out_node->inputs[i] == op_node) {
+        out_node->inputs[i] = recover_padding_op_node;
+        recover_padding_op_node->outputs.push_back(out_node);
+      }
+    }
+
+    // create variable in scope
+    scope->Var(recover_padding_input_name);
+    auto* recover_padding_input_tensor =
+        scope->FindVar(recover_padding_input_name)->GetMutable();
+    recover_padding_input_tensor->mutable_data(platform::CUDAPlace());
+
+    // rename
+    op_node->Op()->RenameOutput(out_node->Name(), recover_padding_input_name);
+  };
+
+  GraphPatternDetector gpd1;
+  patterns::SkipLayernorm skip_layernorm(gpd1.mutable_pattern(),
+                                         "remove_padding_recover_padding_pass");
+  skip_layernorm();
+
+  auto handler1 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* graph) {
+    VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
+               "skip_layernorm";
+
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_x, skip_layernorm_x,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_y, skip_layernorm_y,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_op, skip_layernorm_op,
+                              skip_layernorm);
+    GET_IR_NODE_FROM_SUBGRAPH(skip_layernorm_out, skip_layernorm_out,
+                              skip_layernorm);
+
+    insert_remove_padding_op(skip_layernorm_x, skip_layernorm_op);
+    insert_remove_padding_op(skip_layernorm_y, skip_layernorm_op);
+    insert_recover_padding_op(skip_layernorm_op, skip_layernorm_out);
+
+    found_subgraph_count++;
+  };
+  gpd1(graph, handler1);
+
+  GraphPatternDetector gpd2;
+  patterns::MultiheadMatmul multihead_matmul(
+      gpd2.mutable_pattern(), "remove_padding_recover_padding_pass");
+  multihead_matmul();
+
+  auto handler2 = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                      Graph* graph) {
+    VLOG(3) << "remove_padding_recover_padding_pass for transformer: "
+               "multihead_matmul";
+
+    GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_input, multihead_matmul_input,
+                              multihead_matmul);
+    GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_op, multihead_matmul_op,
+                              multihead_matmul);
+    GET_IR_NODE_FROM_SUBGRAPH(multihead_matmul_out, multihead_matmul_out,
+                              multihead_matmul);
+
+    insert_remove_padding_op(multihead_matmul_input,
multihead_matmul_op); + insert_recover_padding_op(multihead_matmul_op, multihead_matmul_out); + + found_subgraph_count++; + }; + gpd2(graph, handler2); + + GraphPatternDetector gpd3; + patterns::Fc fc(gpd3.mutable_pattern(), + "remove_padding_recover_padding_pass"); + fc(); + + auto handler3 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) << "remove_padding_recover_padding_pass for transformer: fc"; + + GET_IR_NODE_FROM_SUBGRAPH(fc_input, fc_input, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_op, fc_op, fc); + GET_IR_NODE_FROM_SUBGRAPH(fc_out, fc_out, fc); + + insert_remove_padding_op(fc_input, fc_op); + insert_recover_padding_op(fc_op, fc_out); + + found_subgraph_count++; + }; + gpd3(graph, handler3); + + GraphPatternDetector gpd4; + patterns::Activation activation(gpd4.mutable_pattern(), + "remove_padding_recover_padding_pass"); + activation(); + + auto handler4 = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(3) + << "remove_padding_recover_padding_pass for transformer: activation"; + + GET_IR_NODE_FROM_SUBGRAPH(activation_input, activation_input, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_op, activation_op, activation); + GET_IR_NODE_FROM_SUBGRAPH(activation_out, activation_out, activation); + + insert_remove_padding_op(activation_input, activation_op); + insert_recover_padding_op(activation_op, activation_out); + + found_subgraph_count++; + }; + gpd4(graph, handler4); + + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(remove_padding_recover_padding_pass, + paddle::framework::ir::RemovePaddingRecoverPaddingPass); diff --git a/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h new file mode 100644 index 0000000000000..d7ccfc75c2000 --- /dev/null +++ b/paddle/fluid/framework/ir/remove_padding_recover_padding_pass.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
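+
+// This pass brackets the transformer-style ops matched below (skip_layernorm,
+// multihead_matmul, fc and the relu/sigmoid/tanh activations) with
+// remove_padding/recover_padding ops so that the generated TensorRT engine
+// can run them on compacted, padding-free sequences.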
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct SkipLayernorm : public PatternBase { + SkipLayernorm(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "skip_layernorm") {} + + void operator()(); + + PATTERN_DECL_NODE(skip_layernorm_x); + PATTERN_DECL_NODE(skip_layernorm_y); + PATTERN_DECL_NODE(skip_layernorm_op); + PATTERN_DECL_NODE(skip_layernorm_out); +}; + +struct MultiheadMatmul : public PatternBase { + MultiheadMatmul(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "multihead_matmul") {} + + void operator()(); + + PATTERN_DECL_NODE(multihead_matmul_input); + PATTERN_DECL_NODE(multihead_matmul_op); + PATTERN_DECL_NODE(multihead_matmul_out); +}; + +struct Fc : public PatternBase { + Fc(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "fc") {} + + void operator()(); + + PATTERN_DECL_NODE(fc_input); + PATTERN_DECL_NODE(fc_op); + PATTERN_DECL_NODE(fc_out); +}; + +struct Activation : public PatternBase { + Activation(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "activation") {} + + void operator()(); + + PATTERN_DECL_NODE(activation_input); + PATTERN_DECL_NODE(activation_op); + PATTERN_DECL_NODE(activation_out); +}; +} // namespace patterns + +class RemovePaddingRecoverPaddingPass : public FusePassBase { + public: + RemovePaddingRecoverPaddingPass() {} + virtual ~RemovePaddingRecoverPaddingPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"remove_padding_recover_padding_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc new file mode 100644 index 0000000000000..37e77bc134d3c --- /dev/null +++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.cc @@ -0,0 +1,161 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
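+
+// This pass matches the transformer embedding stem -- two lookup_table(_v2)
+// ops whose outputs meet in a single elementwise_add -- and attaches a
+// transformer_input_convert op to the second lookup table's Ids input to
+// produce the pos_id_tensor and max_seqlen_tensor consumed downstream.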
+ +#include "paddle/fluid/framework/ir/set_transformer_input_convert_pass.h" + +#include + +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +SetTransformerInputConvertPass::SetTransformerInputConvertPass() { + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .End(); +} +namespace patterns { + +void SetTransformerInputConvert::operator()() { + std::unordered_set lookup_table_ops{"lookup_table", + "lookup_table_v2"}; + // Create nodes for lookup_table1 op. + auto *lookup_table1_x = pattern->NewNode(lookup_table1_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table1_w = pattern->NewNode(lookup_table1_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table1_op = + pattern->NewNode(lookup_table1_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table1_out = pattern->NewNode(lookup_table1_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "X"); + + // Create nodes for lookup_table2 op. + auto *lookup_table2_x = pattern->NewNode(lookup_table2_x_repr()) + ->assert_is_ops_input(lookup_table_ops, "Ids"); + auto *lookup_table2_w = pattern->NewNode(lookup_table2_w_repr()) + ->assert_is_ops_input(lookup_table_ops, "W"); + auto *lookup_table2_op = + pattern->NewNode(lookup_table2_repr())->assert_is_ops(lookup_table_ops); + auto *lookup_table2_out = pattern->NewNode(lookup_table2_out_repr()) + ->assert_is_ops_output(lookup_table_ops) + ->AsIntermediate() + ->assert_is_op_input("elementwise_add", "Y"); + + // Create nodes for elementwise_add op. + auto *elementwise_op = + pattern->NewNode(elementwise_repr())->assert_is_op("elementwise_add"); + auto *elementwise_out = pattern->NewNode(elementwise_out_repr()) + ->AsOutput() + ->assert_is_only_output_of_op("elementwise_add"); + + // links nodes. 
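+  // Both embedding branches must converge on the same elementwise_add for
+  // the pattern to match.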
+  lookup_table1_op->LinksFrom({lookup_table1_x, lookup_table1_w})
+      .LinksTo({lookup_table1_out});
+  lookup_table2_op->LinksFrom({lookup_table2_x, lookup_table2_w})
+      .LinksTo({lookup_table2_out});
+  elementwise_op->LinksFrom({lookup_table1_out, lookup_table2_out})
+      .LinksTo({elementwise_out});
+}
+
+}  // namespace patterns
+
+void SetTransformerInputConvertPass::ApplyImpl(ir::Graph *graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  FusePassBase::Init(name_scope_, graph);
+  int found_subgraph_count = 0;
+
+  GraphPatternDetector gpd;
+  patterns::SetTransformerInputConvert fused_pattern(
+      gpd.mutable_pattern(), "transformer_input_convert_pass");
+  fused_pattern();
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *graph) {
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "transformer_input_convert_pass in op compat failed.";
+      return;
+    }
+
+    VLOG(3) << "transformer_input_convert_pass for pos_id, max_seqlen";
+
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table2_x, lookup_table2_x, fused_pattern);
+
+    // create op, var in graph
+    OpDesc new_desc;
+    new_desc.SetType("transformer_input_convert");
+
+    // inputs
+    new_desc.SetInput("X", {lookup_table2_x->Name()});
+
+    // outputs
+    std::vector output_0 = {"pos_id_tensor"};
+    std::vector output_1 = {"max_seqlen_tensor"};
+    new_desc.SetOutput("PosId", output_0);
+    new_desc.SetOutput("MaxSeqlen", output_1);
+
+    std::string transformer_input_convert_out0_name = "pos_id_tensor";
+    std::string transformer_input_convert_out1_name = "max_seqlen_tensor";
+    VarDesc transformer_input_convert_out0(transformer_input_convert_out0_name);
+    VarDesc transformer_input_convert_out1(transformer_input_convert_out1_name);
+    transformer_input_convert_out0.SetDataType(proto::VarType::INT32);
+    transformer_input_convert_out1.SetDataType(proto::VarType::INT32);
+    transformer_input_convert_out0.SetShape({-1});
+    transformer_input_convert_out1.SetShape({-1});
+    transformer_input_convert_out0.SetPersistable(false);
+    transformer_input_convert_out1.SetPersistable(false);
+
+    auto new_op_node = graph->CreateOpNode(&new_desc);
+    auto transformer_input_convert_out0_node =
+        graph->CreateVarNode(&transformer_input_convert_out0);
+    auto transformer_input_convert_out1_node =
+        graph->CreateVarNode(&transformer_input_convert_out1);
+
+    // needn't create variable in scope
+
+    IR_NODE_LINK_TO(lookup_table2_x, new_op_node);
+    IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out0_node);
+    IR_NODE_LINK_TO(new_op_node, transformer_input_convert_out1_node);
+
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(set_transformer_input_convert_pass,
+              paddle::framework::ir::SetTransformerInputConvertPass);
+REGISTER_PASS_CAPABILITY(set_transformer_input_convert_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination()
+            .LE("lookup_table", 1)
+            .LE("lookup_table_v2", 1)
+            .LE("elementwise_add", 1));
diff --git a/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h
new file mode 100644
index 0000000000000..5a5843e810f9a
--- /dev/null
+++ b/paddle/fluid/framework/ir/set_transformer_input_convert_pass.h
@@ -0,0 +1,80 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// in_var emb in_var emb +// | | | | +// lookup_table lookup_table +// | | +// lkt_var lkt_var +// \ / +// elementwise_add +// | +// elt_out_var +// +struct SetTransformerInputConvert : public PatternBase { + SetTransformerInputConvert(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "transformer_input_convert") {} + + void operator()(); + + // declare operator node's name + PATTERN_DECL_NODE(lookup_table1); + PATTERN_DECL_NODE(lookup_table2); + PATTERN_DECL_NODE(elementwise); + + // declare variable node's name + PATTERN_DECL_NODE(lookup_table1_x); + PATTERN_DECL_NODE(lookup_table1_w); + PATTERN_DECL_NODE(lookup_table1_out); + PATTERN_DECL_NODE(lookup_table2_x); + PATTERN_DECL_NODE(lookup_table2_w); + PATTERN_DECL_NODE(lookup_table2_out); + PATTERN_DECL_NODE(elementwise_out); +}; +} // namespace patterns + +class SetTransformerInputConvertPass : public FusePassBase { + public: + SetTransformerInputConvertPass(); + virtual ~SetTransformerInputConvertPass() {} + + protected: + void ApplyImpl(Graph *graph) const; + const std::string name_scope_{"transformer_input_convert_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index e4fc52b6fa744..059a9cb21e1d5 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -377,12 +377,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( trt_engine->SetUseDLA(Get("trt_use_dla")); trt_engine->SetDLACore(Get("trt_dla_core")); trt_engine->SetUseInspector(Get("use_inspector")); - - trt_engine->SetWithErnie( - (graph->Has(framework::ir::kEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass)) || - (graph->Has(framework::ir::kPrelnEmbEltwiseLayernormPass) && - graph->Has(framework::ir::kMultiheadMatmulPass))); + trt_engine->SetWithErnie(graph->Has(framework::ir::kMultiheadMatmulPass)); if (use_static_engine) { trt_engine_serialized_data = GetTrtEngineSerializedData( diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f59494628ad7e..4c3587e54036b 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -98,18 +98,21 @@ const std::vector kTRTSubgraphPasses({ "multihead_matmul_fuse_pass_v3", // "skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // - "conv_bn_fuse_pass", // - 
"unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "conv_elementwise_add_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + // "set_transformer_input_convert_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_reshape2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // + // "remove_padding_recover_padding_pass", // + // "delete_remove_padding_recover_padding_pass", // + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we From 6e90ba1b16754b23b4eebc72fe5fab84c11172a1 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Thu, 12 May 2022 14:29:53 +0800 Subject: [PATCH 24/49] add exp,log trt converter (#42655) --- .../fluid/inference/api/analysis_predictor.cc | 2 + .../inference/tensorrt/convert/CMakeLists.txt | 38 ++++- .../inference/tensorrt/convert/unary_op.cc | 84 +++++++++++ paddle/fluid/inference/tensorrt/op_teller.cc | 6 +- .../ir/inference/test_trt_convert_unary.py | 132 ++++++++++++++++++ 5 files changed, 256 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/unary_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 0d3a687c461d1..181f8f6649a5d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1744,6 +1744,8 @@ USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); +USE_TRT_CONVERTER(exp); +USE_TRT_CONVERTER(log); USE_TRT_CONVERTER(sigmoid); USE_TRT_CONVERTER(tanh); USE_TRT_CONVERTER(fc); diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ec8c1b2fcd75c..77c31f941d7fc 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,10 +1,38 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc - batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc group_norm_op.cc - pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc - shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc transpose_op.cc flatten_op.cc flatten_contiguous_range_op.cc - emb_eltwise_layernorm.cc skip_layernorm.cc scale_op.cc slice_op.cc hard_sigmoid_op.cc hard_swish_op.cc clip_op.cc + SRCS matmul_op.cc + conv2d_op.cc + fc_op.cc + pool2d_op.cc + elementwise_op.cc + batch_norm_op.cc + activation_op.cc + unary_op.cc + softmax_op.cc + concat_op.cc + dropout_op.cc + group_norm_op.cc + pad_op.cc + split_op.cc + prelu_op.cc + leaky_relu_op.cc + gelu_op.cc + layer_norm_op.cc + multihead_matmul_op.cc + shuffle_channel_op.cc + 
swish_op.cc + instance_norm_op.cc + stack_op.cc + transpose_op.cc + flatten_op.cc + flatten_contiguous_range_op.cc + emb_eltwise_layernorm.cc + skip_layernorm.cc + scale_op.cc + slice_op.cc + hard_sigmoid_op.cc + hard_swish_op.cc + clip_op.cc gather_op.cc anchor_generator_op.cc yolo_box_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/unary_op.cc b/paddle/fluid/inference/tensorrt/convert/unary_op.cc new file mode 100644 index 0000000000000..aa3d38ebe2073 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/unary_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "glog/logging.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class UnaryOpConverter : public OpConverter { + public: + UnaryOpConverter() {} + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + // Here the two nullptr looks strange, that's because the + // framework::OpDesc's constructor is strange. 
+    framework::OpDesc op_desc(op, nullptr);
+    VLOG(3) << "convert a fluid unary op to tensorrt unary layer whose "
+               "type is "
+            << op_type_;
+    nvinfer1::ITensor* input_tensor =
+        engine_->GetITensor(op_desc.Input("X")[0]);
+    auto op_pair = ops.find(op_type_);
+    nvinfer1::IUnaryLayer* layer =
+        TRT_ENGINE_ADD_LAYER(engine_, Unary, *input_tensor, op_pair->second);
+    auto output_name = op_desc.Output("Out")[0];
+    RreplenishLayerAndOutput(layer, op_type_, {output_name}, test_mode);
+  }
+
+ protected:
+  std::string op_type_;
+  static const std::unordered_map<std::string, nvinfer1::UnaryOperation> ops;
+};
+
+const std::unordered_map<std::string, nvinfer1::UnaryOperation>
+    UnaryOpConverter::ops = {
+        {"exp", nvinfer1::UnaryOperation::kEXP},
+        {"log", nvinfer1::UnaryOperation::kLOG},
+};
+
+class ExpOpConverter : public UnaryOpConverter {
+ public:
+  ExpOpConverter() { op_type_ = "exp"; }
+};
+
+class LogOpConverter : public UnaryOpConverter {
+ public:
+  LogOpConverter() { op_type_ = "log"; }
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_TRT_OP_CONVERTER(exp, ExpOpConverter);
+REGISTER_TRT_OP_CONVERTER(log, LogOpConverter);
diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc
index 280a1e3708bdb..01f2a4fca9d4e 100644
--- a/paddle/fluid/inference/tensorrt/op_teller.cc
+++ b/paddle/fluid/inference/tensorrt/op_teller.cc
@@ -65,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv2d_fusion",
       "pool2d",
       "relu",
+      "exp",
+      "log",
       "softmax",
       "sigmoid",
       "hard_swish",
@@ -128,6 +130,8 @@ struct SimpleOpTypeSetTeller : public Teller {
       "conv2d_fusion",
       "pool2d",
       "relu",
+      "exp",
+      "log",
       "softmax",
       "sigmoid",
       "hard_swish",
@@ -200,7 +204,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8,
 
   for (auto& teller : tellers_) {
     if (op_type == "relu" || op_type == "relu6" || op_type == "tanh" ||
-        op_type == "sigmoid") {
+        op_type == "sigmoid" || op_type == "exp" || op_type == "log") {
       auto* block = desc.Block();
       if (block == nullptr) {
         VLOG(3) << "The block desc is nullptr, we can't continue to analyze. "
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py
new file mode 100644
index 0000000000000..2abf0a1acda67
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
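The converter above only registers the exp/log mapping; a network reaches it
through the usual TensorRT subgraph flow, gated by op_teller. For context, a
minimal sketch of driving the new converters from the inference API; the model
path, input shape, and workspace size are illustrative assumptions, not taken
from this patch:

import numpy as np
import paddle.inference as paddle_infer

config = paddle_infer.Config("./model.pdmodel", "./model.pdiparams")  # hypothetical paths
config.enable_use_gpu(256, 0)
# min_subgraph_size=1 lets even a lone exp/log op form a TensorRT subgraph
config.enable_tensorrt_engine(workspace_size=1 << 28,
                              max_batch_size=1,
                              min_subgraph_size=1,
                              precision_mode=paddle_infer.PrecisionType.Float32)
predictor = paddle_infer.create_predictor(config)

input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(np.ones([1, 3, 32, 32], dtype=np.float32))
predictor.run()
output_handle = predictor.get_output_handle(predictor.get_output_names()[0])
print(output_handle.copy_to_cpu())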
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import unittest +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set + + +class TrtConvertActivationTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): + if dims == 1: + return np.ones([32]).astype(np.float32) + elif dims == 2: + return np.ones([3, 32]).astype(np.float32) + elif dims == 3: + return np.ones([3, 32, 32]).astype(np.float32) + else: + return np.ones([batch, 3, 32, 32]).astype(np.float32) + + for dims in [1, 2, 3, 4]: + for batch in [1, 4]: + for op_type in ["exp", "log"]: + self.dims = dims + dics = [{}] + + ops_config = [{ + "op_type": op_type, + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial( + generate_input1, dims, batch, dics)) + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + if self.dims == 1: + self.dynamic_shape.min_input_shape = {"input_data": [1]} + self.dynamic_shape.max_input_shape = {"input_data": [64]} + self.dynamic_shape.opt_input_shape = {"input_data": [32]} + elif self.dims == 2: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32]} + elif self.dims == 3: + self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [3, 32, 32]} + else: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 16, 16] + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 3, 32, 32] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() From 35c7c83599240af4b6b72111cd83562418f8a609 Mon Sep 17 00:00:00 2001 From: 
Jiabin Yang <360788950@qq.com> Date: Thu, 12 May 2022 14:39:04 +0800 Subject: [PATCH 25/49] [Eager] Remove full reserved strategy (#42690) * remove full reserved strategy * fix inplace error --- .../auto_code_generator/eager_generator.cc | 31 +++----- .../final_state_generator/eager_gen.py | 16 ++-- paddle/fluid/eager/tensor_wrapper.h | 76 ++++++------------- .../tensor_wrapper_test.cc | 12 ++- 4 files changed, 51 insertions(+), 84 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 44fa8461f2fe9..3edd13ccd597f 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1156,28 +1156,20 @@ static std::string GenerateGradNodeCreationContent( for (const auto& iter : op_base_infos) { const std::map& grad_ins_fwd_slotname_map = iter.GetGradInsFwdSlotnameMap(); - const std::unordered_set& no_need_buffer_ins = - iter.GetNoNeedBufferInputs(); for (auto& kv : grad_ins_fwd_slotname_map) { const std::string& tensor_wrapper_name = kv.second; - std::string full_reserved = "false"; - if (fwd_outputs_name_pos_map.find(tensor_wrapper_name) == - fwd_outputs_name_pos_map.end() && - !no_need_buffer_ins.count(tensor_wrapper_name)) { - full_reserved = "true"; - } const char* SET_TENSOR_WRAPPER_TEMPLATE = - " grad_node->SetTensorWrapper%s(%s, %s);\n"; + " grad_node->SetTensorWrapper%s(%s);\n"; // Replace output directly with input in inplace op. if (!inplace_map.empty() && inplace_map.count(tensor_wrapper_name)) { auto inplace_input_name = inplace_map[tensor_wrapper_name]; grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(inplace_input_name), full_reserved); + LegalizeVarName(inplace_input_name)); } else { grad_node_creation_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, LegalizeVarName(tensor_wrapper_name), - LegalizeVarName(tensor_wrapper_name), full_reserved); + LegalizeVarName(tensor_wrapper_name)); } } } @@ -2592,7 +2584,6 @@ static std::string GenerateGradNodeHeaderContents( std::string tensor_wrapper_arg_str; std::string tensor_wrapper_body_str; - std::string full_reserved_str = "full_reserved"; std::string no_need_buffer_str = "false"; if (no_need_buffer_ins.count(tensor_wrapper_name)) { no_need_buffer_str = "true"; @@ -2610,12 +2601,12 @@ static std::string GenerateGradNodeHeaderContents( const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = "for(const auto& eager_tensor : %s) {\n" - " %s.emplace_back( egr::TensorWrapper(eager_tensor, %s " - "/*full_reserved*/, %s) );\n" + " %s.emplace_back( egr::TensorWrapper(eager_tensor " + ", %s) );\n" " }\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, tensor_wrapper_name, - struct_tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + struct_tensor_wrapper_name, no_need_buffer_str); const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = "for (auto tw: %s) {\n" @@ -2636,22 +2627,20 @@ static std::string GenerateGradNodeHeaderContents( TENSOR_WRAPPER_MEMBER_TEMPLATE, struct_tensor_wrapper_name); const char* SET_TENSOR_WRAPPER_BODY_TEMPLATE = - "%s = egr::TensorWrapper(%s, %s /*full_reserved*/, %s);\n"; + "%s = egr::TensorWrapper(%s, %s);\n"; tensor_wrapper_body_str = paddle::string::Sprintf( SET_TENSOR_WRAPPER_BODY_TEMPLATE, struct_tensor_wrapper_name, - tensor_wrapper_name, full_reserved_str, no_need_buffer_str); + tensor_wrapper_name, no_need_buffer_str); 
const char* CLEAR_TENSOR_WRAPPER_TEMPLATE = " %s.clear();\n"; clear_tensor_wrappers_str += paddle::string::Sprintf( CLEAR_TENSOR_WRAPPER_TEMPLATE, struct_tensor_wrapper_name); } - std::string full_reserved_signature_str = "bool full_reserved"; const char* SET_TENSOR_WRAPPER_TEMPLATE = - " void SetTensorWrapper%s(%s, %s) {\n %s\n }\n"; + " void SetTensorWrapper%s(%s) {\n %s\n }\n"; set_tensor_wrappers_str += paddle::string::Sprintf( SET_TENSOR_WRAPPER_TEMPLATE, tensor_wrapper_name, - tensor_wrapper_arg_str, full_reserved_signature_str, - tensor_wrapper_body_str); + tensor_wrapper_arg_str, tensor_wrapper_body_str); } } VLOG(6) << "Generated TensorWrapper"; diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 092c4b6e605db..4e2c6db1a44a4 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -55,8 +55,8 @@ def ParseArguments(): ## Code Gen Templates ## ######################## SET_PLAIN_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}, bool full_reserved) {{ - {} = egr::TensorWrapper({}, full_reserved, {}); +""" void SetTensorWrapper{}(const paddle::experimental::Tensor& {}) {{ + {} = egr::TensorWrapper({}, {}); }} """ @@ -69,9 +69,9 @@ def ParseArguments(): """ SET_VECTOR_TENSOR_WRAPPER_TEMPLATE = \ -""" void SetTensorWrapper{}(const std::vector& {}, bool full_reserved) {{ +""" void SetTensorWrapper{}(const std::vector& {}) {{ for(const auto& eager_tensor : {}) {{ - {}.emplace_back(egr::TensorWrapper(eager_tensor, full_reserved, {})); + {}.emplace_back(egr::TensorWrapper(eager_tensor, {})); }}; }} """ @@ -676,9 +676,9 @@ def GenerateNodeCreationCodes(self): if is_fwd_input: if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), true);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, true);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_input_tensor_wrappers_list.append(set_tensor_wrappers) else: if num_fwd_outputs > 1: @@ -688,9 +688,9 @@ def GenerateNodeCreationCodes(self): fwd_output_pos = forward_outputs_position_map[name][1] if is_optional: - set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()), false);" + set_tensor_wrappers = f"{indent}if({name}.get_ptr() != nullptr) grad_node->SetTensorWrapper{name}(*({name}.get_ptr()));" else: - set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name}, false);" + set_tensor_wrappers = f"{indent}grad_node->SetTensorWrapper{name}({name});" set_output_tensor_wrappers_list.append(set_tensor_wrappers) set_input_tensor_wrappers_str = "\n".join( set_input_tensor_wrappers_list) diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index a90b7bc7d7202..8893e0ed7ee0a 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -34,7 +34,6 @@ class TensorWrapper { public: TensorWrapper() = default; explicit TensorWrapper(const paddle::experimental::Tensor& tensor, - bool full_reserved = false, bool no_need_buffer = false) { // set inplace_version_snapshot_ according to 
tensor's current inplace // version. @@ -46,32 +45,12 @@ class TensorWrapper { } /** - * Normally, we should fully reserved all non-output or non-leaf fwd tensor - * here. And for fwd output tensor, we should not reserve its autogradmeta, - * to avoid recursive depends on GradNodeBase + * Normally, we should only save data and part of autograd_meta of fwd + * tensor, and should not reserve its original grad_node, + * to avoid recursive and additional depends on GradNodeBase * **/ - full_reserved_ = full_reserved; + auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); no_need_buffer_ = no_need_buffer; - if (full_reserved_) { - VLOG(6) << "Fully reserved tensor: " << tensor.name(); - intermidiate_tensor_ = tensor; - if (no_need_buffer_) { - if (phi::DenseTensor::classof(tensor.impl().get())) { - // Only Copy Meta - phi::DenseTensor* dense_tensor = - static_cast(tensor.impl().get()); - auto tw_dense_tensor = - std::make_shared(*dense_tensor); - tw_dense_tensor->clear(); - intermidiate_tensor_.set_impl(tw_dense_tensor); - } else { - PADDLE_THROW(paddle::platform::errors::Fatal( - "Unrecognized tensor type for no_need_buffer feature")); - } - } - return; - } - // shallow copy tensor_impl here if (no_need_buffer) { if (phi::DenseTensor::classof(tensor.impl().get())) { @@ -89,10 +68,11 @@ class TensorWrapper { intermidiate_tensor_.set_impl(tensor.impl()); } - // TODO(jiabin): This may has server performance issue - intermidiate_tensor_.set_name(tensor.name() + "@Saved"); + if (VLOG_IS_ON(7)) { + // TODO(jiabin): This may has server performance issue + intermidiate_tensor_.set_name(tensor.name() + "@Saved"); + } - auto* tensor_autograd_meta = EagerUtils::nullable_autograd_meta(tensor); if (tensor_autograd_meta) { auto autograd_meta = std::make_shared(*tensor_autograd_meta); @@ -112,33 +92,28 @@ class TensorWrapper { check_inplace_version(); - // if it's full_reserved just return the full copy of tensor - if (full_reserved_) { - return intermidiate_tensor_; + paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + + std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (new_grad_node) { + VLOG(3) << "Recovered TensorWrapper with GradNode " + << new_grad_node->name() << " addr: " << new_grad_node.get(); } else { - paddle::experimental::Tensor recovered_tensor = intermidiate_tensor_; + VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + } + auto* intermediate_autograd_meta = + EagerUtils::nullable_autograd_meta(intermidiate_tensor_); - std::shared_ptr new_grad_node = weak_grad_node_.lock(); + if (intermediate_autograd_meta) { + auto p_ab_autograd_meta = + std::make_shared(*intermediate_autograd_meta); if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper with GradNode " - << new_grad_node->name() << " addr: " << new_grad_node.get(); - } else { - VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + p_ab_autograd_meta->SetGradNode(new_grad_node); } - auto* intermediate_autograd_meta = - EagerUtils::nullable_autograd_meta(intermidiate_tensor_); - - if (intermediate_autograd_meta) { - auto p_ab_autograd_meta = - std::make_shared(*intermediate_autograd_meta); - if (new_grad_node) { - p_ab_autograd_meta->SetGradNode(new_grad_node); - } - recovered_tensor.set_autograd_meta(p_ab_autograd_meta); - } - - return recovered_tensor; + recovered_tensor.set_autograd_meta(p_ab_autograd_meta); } + + return recovered_tensor; } void clear() { intermidiate_tensor_.reset(); } @@ -179,7 +154,6 @@ class TensorWrapper { } private: - bool full_reserved_ = false; 
   bool no_need_buffer_ = false;
   paddle::experimental::Tensor intermidiate_tensor_;
   std::weak_ptr<GradNodeBase> weak_grad_node_;
diff --git a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
index 5f563edee39f1..28c3472f90d03 100644
--- a/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
+++ b/paddle/fluid/eager/tests/data_structure_tests/tensor_wrapper_test.cc
@@ -40,9 +40,11 @@ TEST(TensorWrapper, Basic) {
   auto auto_grad0 = std::make_shared<egr::AutogradMeta>(edge0);
   et1.set_autograd_meta(auto_grad0);
   et1.set_name("et1");
-  auto tw0 = egr::TensorWrapper(et1, true);
+  auto tw0 = egr::TensorWrapper(et1);
   auto recover_et1 = tw0.recover();
-  CHECK_EQ(recover_et1.name(), std::string("et1"));
+  if (VLOG_IS_ON(7)) {
+    CHECK_EQ(recover_et1.name(), std::string("et1@Saved"));
+  }
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).first,
            egr::EagerUtils::OutRankInfo(et1).first);
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et1).second,
@@ -68,13 +70,15 @@ TEST(TensorWrapper, Basic) {
   et2.set_autograd_meta(auto_grad1);
   auto tw1 = egr::TensorWrapper(et2, false);
   auto recover_et2 = tw1.recover();
-  CHECK_EQ(recover_et2.name(), std::string("et2@Saved"));
+  if (VLOG_IS_ON(7)) {
+    CHECK_EQ(recover_et2.name(), std::string("et2@Saved"));
+  }
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).first,
            egr::EagerUtils::OutRankInfo(et2).first);
   CHECK_EQ(egr::EagerUtils::OutRankInfo(recover_et2).second,
            egr::EagerUtils::OutRankInfo(et2).second);
   // Test Raw recover
   paddle::experimental::Tensor et3;
-  auto tw2 = egr::TensorWrapper(et3, true);
+  auto tw2 = egr::TensorWrapper(et3);
   CHECK(tw2.recover().initialized() == false);
 }

From 43d70bccdaab5f0f69da41db953051f04797d1bd Mon Sep 17 00:00:00 2001
From: Jiabin Yang <360788950@qq.com>
Date: Thu, 12 May 2022 14:39:11 +0800
Subject: [PATCH 26/49] Speed up sr accumulation (#42658)

* Support Gradient Accumulation for sr

* add ut

* change ut to fit small vector

* speed up accumulation

---
 .../eager/accumulation/accumulation_node.cc   | 17 ++---------------
 1 file changed, 2 insertions(+), 15 deletions(-)

diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc
index 857f1be1f7ae0..2ed44ce489934 100644
--- a/paddle/fluid/eager/accumulation/accumulation_node.cc
+++ b/paddle/fluid/eager/accumulation/accumulation_node.cc
@@ -34,22 +34,9 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor,
     *tensor = t;
   } else {
     // Accumulation
-    PADDLE_ENFORCE_EQ(t.initialized(), true,
-                      paddle::platform::errors::Fatal(
-                          "We can only accumulate initialized tensor, but we "
-                          "got tensor: %s is empty please check you network "
-                          "and make sure it creates grads.",
-                          t.name()));
-    PADDLE_ENFORCE_NOT_NULL(
-        tensor, paddle::platform::errors::Fatal(
-                    "We can only accumulate initialized tensor to non-nullptr "
-                    "tensor but we got nullptr please check you network "
-                    "and make sure it creates grads."));
-
-    if (t.is_dense_tensor()) {
-      if (tensor->is_dense_tensor()) {
+    if (LIKELY(t.is_dense_tensor())) {
+      if (LIKELY(tensor->is_dense_tensor())) {
         paddle::imperative::TensorAdd<paddle::experimental::Tensor>(t, tensor);
-
       } else {
         // TODO(jiabin): Support Other TensorBase later
         // TODO(zhanlve): Replace SelectedRowsAddTensor with

From e906eb5bce215f08a2cf22fc71ea85903c25433e Mon Sep 17 00:00:00 2001
From: JYChen
Date: Thu, 12 May 2022 15:18:52 +0800
Subject: [PATCH 27/49] add batch tensor support for some vision transforms
 functions (#42701)

---
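In user terms, this change lets the tensor paths of these transforms accept a
batched NCHW input in addition to a single CHW image. A small sketch, kept
consistent with the new tests below; the shapes are illustrative:

import paddle
import paddle.vision.transforms as T
import paddle.vision.transforms.functional as F

batch = paddle.rand([2, 3, 32, 32])                    # NCHW batch
out = T.ColorJitter(1.2, 0.2, 0.5, 0.2)(batch)         # whole batch at once
out = T.RandomAffine(15, translate=[0.1, 0.1])(batch)
single = F.adjust_hue(batch[0], -0.2)                  # CHW input still works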
python/paddle/tests/test_transforms.py | 125 ++++++++++++++++++ .../vision/transforms/functional_tensor.py | 20 ++- python/paddle/vision/transforms/transforms.py | 9 +- 3 files changed, 147 insertions(+), 7 deletions(-) diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 82ae3cb6b68f6..e07ac47a0f818 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -458,6 +458,20 @@ def test_color_jitter(self): trans = transforms.Compose([transforms.ColorJitter(1.1, 2.2, 0.8, 0.1)]) self.do_transform(trans) + color_jitter_trans = transforms.ColorJitter(1.2, 0.2, 0.5, 0.2) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = color_jitter_trans(batch_input) + + def test_perspective(self): + trans = transforms.RandomPerspective(prob=1.0, distortion_scale=0.7) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + + def test_affine(self): + trans = transforms.RandomAffine(15, translate=[0.1, 0.1]) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = trans(batch_input) + def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) @@ -508,6 +522,10 @@ def test_erase(self): ]) self.do_transform(trans) + erase_trans = transforms.RandomErasing(value=(0.5, 0.2, 0.01)) + batch_input = paddle.rand((2, 3, 4, 4), dtype=paddle.float32) + result = erase_trans(batch_input) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) @@ -1003,6 +1021,113 @@ def test_perspective(self): # Tolerance : less than 6% of different pixels assert ratio_diff_pixels < 0.06 + def test_batch_input(self): + paddle.seed(777) + batch_tensor = paddle.rand((2, 3, 8, 8), dtype=paddle.float32) + + def test_erase(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.erase(input1, 1, 1, 2, 2, 0.5), + F.erase(input2, 1, 1, 2, 2, 0.5) + ]) + + batch_result = F.erase(batch_tensor, 1, 1, 2, 2, 0.5) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_erase(batch_tensor)) + + def test_affine(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.affine( + input1, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]), F.affine( + input2, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + ]) + batch_result = F.affine( + batch_tensor, + 45, + translate=[0.2, 0.2], + scale=0.5, + shear=[-10, 10]) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_affine(batch_tensor)) + + def test_perspective(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + startpoints = [[0, 0], [3, 0], [4, 5], [6, 7]] + endpoints = [[0, 1], [3, 1], [4, 4], [5, 7]] + target_result = paddle.stack([ + F.perspective(input1, startpoints, endpoints), + F.perspective(input2, startpoints, endpoints) + ]) + + batch_result = F.perspective(batch_tensor, startpoints, endpoints) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_perspective(batch_tensor)) + + def test_adjust_brightness(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_brightness(input1, 2.1), + F.adjust_brightness(input2, 2.1) + ]) + + batch_result = F.adjust_brightness(batch_tensor, 2.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_brightness(batch_tensor)) + + 
def test_adjust_contrast(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_contrast(input1, 0.3), F.adjust_contrast(input2, 0.3) + ]) + + batch_result = F.adjust_contrast(batch_tensor, 0.3) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_contrast(batch_tensor)) + + def test_adjust_saturation(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack([ + F.adjust_saturation(input1, 1.1), + F.adjust_saturation(input2, 1.1) + ]) + + batch_result = F.adjust_saturation(batch_tensor, 1.1) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_saturation(batch_tensor)) + + def test_adjust_hue(batch_tensor): + input1, input2 = paddle.unbind(batch_tensor, axis=0) + target_result = paddle.stack( + [F.adjust_hue(input1, -0.2), F.adjust_hue(input2, -0.2)]) + + batch_result = F.adjust_hue(batch_tensor, -0.2) + + return paddle.allclose(batch_result, target_result) + + self.assertTrue(test_adjust_hue(batch_tensor)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py index df2529d1224b3..27f83029babaa 100644 --- a/python/paddle/vision/transforms/functional_tensor.py +++ b/python/paddle/vision/transforms/functional_tensor.py @@ -28,8 +28,9 @@ def _assert_image_tensor(img, data_format): if not isinstance( - img, paddle.Tensor) or img.ndim != 3 or not data_format.lower() in ( - 'chw', 'hwc'): + img, paddle.Tensor + ) or img.ndim < 3 or img.ndim > 4 or not data_format.lower() in ('chw', + 'hwc'): raise RuntimeError( 'not support [type={}, ndim={}, data_format={}] paddle image'. format(type(img), img.ndim, data_format)) @@ -276,7 +277,10 @@ def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): paddle.Tensor: Affined image. 
""" - img = img.unsqueeze(0) + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) + img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) matrix = paddle.to_tensor(matrix, place=img.place) @@ -292,8 +296,9 @@ def affine(img, matrix, interpolation="nearest", fill=None, data_format='CHW'): out = _grid_transform(img, grid, mode=interpolation, fill=fill) out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out - return out.squeeze(0) + return out def rotate(img, @@ -443,7 +448,9 @@ def perspective(img, """ - img = img.unsqueeze(0) + ndim = len(img.shape) + if ndim == 3: + img = img.unsqueeze(0) img = img if data_format.lower() == 'chw' else img.transpose((0, 3, 1, 2)) ow, oh = img.shape[-1], img.shape[-2] @@ -454,8 +461,9 @@ def perspective(img, out = _grid_transform(img, grid, mode=interpolation, fill=fill) out = out if data_format.lower() == 'chw' else out.transpose((0, 2, 3, 1)) + out = out.squeeze(0) if ndim == 3 else out - return out.squeeze(0) + return out def vflip(img, data_format='CHW'): diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 79d3b1bc92ece..fea2efb1fb2b1 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -45,7 +45,14 @@ def _get_image_size(img): elif F._is_numpy_image(img): return img.shape[:2][::-1] elif F._is_tensor_image(img): - return img.shape[1:][::-1] # chw + if len(img.shape) == 3: + return img.shape[1:][::-1] # chw -> wh + elif len(img.shape) == 4: + return img.shape[2:][::-1] # nchw -> wh + else: + raise ValueError( + "The dim for input Tensor should be 3-D or 4-D, but received {}". + format(len(img.shape))) else: raise TypeError("Unexpected type {}".format(type(img))) From cc343a41e3062bdcc5a086dfe0fd019c8f7ac27c Mon Sep 17 00:00:00 2001 From: helen88 Date: Thu, 12 May 2022 15:31:49 +0800 Subject: [PATCH 28/49] add xpu buffer_reader, *test=kunlun (#42578) * add xpu buffer_reader, *test=kunlun * xpu buffer_reader, use XPUDeviceGuard, *test=kunlun * modify xpu.cmake, *test=kunlun * modify xpu.cmake, *test=kunlun * modify xpu.cmake, *test=kunlun * add xpu buffer_reader, *test=kunlun * add xpu buffer reader, *test=kunlun * add xpu buffer reader, *test=kunlun --- .../fluid/operators/reader/buffered_reader.cc | 73 +++++++++++++- .../fluid/operators/reader/buffered_reader.h | 13 ++- paddle/fluid/platform/CMakeLists.txt | 2 +- .../fluid/platform/device/xpu/CMakeLists.txt | 1 + paddle/fluid/platform/device/xpu/xpu_info.cc | 6 +- paddle/fluid/platform/device/xpu/xpu_info.h | 10 +- .../platform/device/xpu/xpu_resource_pool.cc | 98 +++++++++++++++++++ .../platform/device/xpu/xpu_resource_pool.h | 64 ++++++++++++ paddle/fluid/platform/device_context.h | 3 +- 9 files changed, 264 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/platform/device/xpu/xpu_resource_pool.cc create mode 100644 paddle/fluid/platform/device/xpu/xpu_resource_pool.h diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 4b6759ea165ed..db0f5758d2f53 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,6 +14,7 @@ #include "paddle/fluid/operators/reader/buffered_reader.h" #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler/event_tracing.h" @@ -85,10 +86,27 @@ BufferedReader::BufferedReader( stream_ = platform::MluStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + int dev_idx = place_.device; + compute_stream_ = + ((platform::XPUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::XpuEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::XpuStreamResourcePool::Instance().New(dev_idx); + } +#endif + cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); npu_buffer_.resize(buffer_size); mlu_buffer_.resize(buffer_size); + xpu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -322,6 +340,57 @@ void BufferedReader::ReadAsync(size_t i) { platform::MLUStreamSync(stream_.get()); } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place_)) { + TensorVec &xpu = xpu_buffer_[i]; + if (xpu.empty()) { + xpu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + xpu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on XPU and CPU devices are not matched. " + "The number on XPU is %d, on CPU is %d", + xpu.size(), cpu.size())); + } + + std::vector xpu_ptrs; + xpu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + xpu[i].Resize(cpu[i].dims()); + xpu[i].set_layout(cpu[i].layout()); + xpu_ptrs.emplace_back(xpu[i].mutable_data(place_, cpu[i].type())); + } + + platform::XPUDeviceGuard gurad(place_.device); + int r = xpu_event_record(events_[i].get(), compute_stream_); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_event_record"); + r = xpu_stream_wait_event(stream_.get(), events_[i].get()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "xpu_stream_wait_event"); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy", + platform::TracerEventType::UserDefined, + 1); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto xpu_ptr = xpu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::DataTypeSize(cpu[i].dtype()); + // TODO(zhanghuan) for now hardware not support xpu_memcpy_async, maybe + // KL3 + if ((platform::is_xpu_place(cpu_place))) { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + platform::XPUStreamSync(stream_.get()); + } else { + memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size); + } + xpu[i].set_lod(cpu[i].lod()); + } + platform::XPUStreamSync(stream_.get()); + } +#endif return i; })); } @@ -359,6 +428,8 @@ void BufferedReader::ReadNextImpl(std::vector *out) { *out = std::move(npu_buffer_[i]); } else if (platform::is_mlu_place(place_)) { *out = std::move(mlu_buffer_[i]); + } else if (platform::is_xpu_place(place_)) { + *out = std::move(xpu_buffer_[i]); } else { *out = std::move(cpu_buffer_[i]); } diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index f0f3b6b7f9fdf..52d3d8d6999a0 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -1,4 
+1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -33,6 +33,10 @@ #include "paddle/fluid/platform/device/mlu/mlu_info.h" #include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" #endif +#ifdef PADDLE_WITH_XPU +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" +#endif namespace paddle { namespace operators { @@ -76,6 +80,7 @@ class BufferedReader : public framework::DecoratedReader { std::vector cuda_buffer_; std::vector npu_buffer_; std::vector mlu_buffer_; + std::vector xpu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; @@ -94,6 +99,12 @@ class BufferedReader : public framework::DecoratedReader { std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_XPU + xpuStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index f29546c5210d9..356b5ab2cd23c 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -125,7 +125,7 @@ cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc x place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) if(WITH_XPU) - target_link_libraries(device_context xpu_context) + target_link_libraries(device_context xpu_context xpu_resource_pool) endif() cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index b6a26f2554a13..3399fff087f8d 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -7,5 +7,6 @@ set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) +cc_library(xpu_resource_pool SRCS xpu_resource_pool.cc DEPS xpu_info) add_subdirectory(tests) diff --git a/paddle/fluid/platform/device/xpu/xpu_info.cc b/paddle/fluid/platform/device/xpu/xpu_info.cc index 2e960c1c0dd9c..cdd7ee7f806e9 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.cc +++ b/paddle/fluid/platform/device/xpu/xpu_info.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at @@ -79,6 +79,10 @@ void MemcpySyncD2D(void* dst, const platform::XPUPlace& dst_place, *dev_ctx); } +void XPUStreamSync(xpuStream stream) { + PADDLE_ENFORCE_XDNN_SUCCESS(xpu_wait(stream), "xpu_wait"); +} + /**************************** Others **************************/ phi::backends::xpu::XPUVersion get_xpu_version(int dev_id) { diff --git a/paddle/fluid/platform/device/xpu/xpu_info.h b/paddle/fluid/platform/device/xpu/xpu_info.h index 33385f8e45937..38b4defadc6c3 100644 --- a/paddle/fluid/platform/device/xpu/xpu_info.h +++ b/paddle/fluid/platform/device/xpu/xpu_info.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -14,8 +14,13 @@ limitations under the License. */ #include #include "paddle/fluid/platform/place.h" #include "paddle/phi/backends/xpu/xpu_info.h" +#include "xpu/runtime.h" namespace paddle { + +using xpuStream = XPUStream; +using xpuEventHandle = XPUEvent; + namespace platform { /***** Version Management *****/ @@ -51,6 +56,9 @@ void MemcpySyncD2D(void *dst, const platform::XPUPlace &dst_place, const void *src, const platform::XPUPlace &src_place, size_t count); +//! Blocks until stream has completed all operations. +void XPUStreamSync(xpuStream stream); + using XPUDeviceGuard = phi::backends::xpu::XPUDeviceGuard; phi::backends::xpu::XPUVersion get_xpu_version(int dev_id); diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc new file mode 100644 index 0000000000000..af0d47c716717 --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.cc @@ -0,0 +1,98 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
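The pools below hand out cached per-device XPU streams and events, which the
buffered reader uses to overlap host-to-device copies with compute. At the
Python level the feature surfaces through DataLoader's buffered reader; a
sketch, assuming an XPU build and a `dataset` defined elsewhere:

import paddle
from paddle.io import DataLoader

paddle.set_device("xpu:0")            # assumes PaddlePaddle built with XPU support
loader = DataLoader(dataset,          # `dataset` is assumed to yield (image, label)
                    batch_size=32,
                    use_buffer_reader=True)  # enables the double-buffering path
for image, label in loader:
    pass  # batches arrive already staged on the XPU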
+ +#if defined(PADDLE_WITH_XPU) +#include "paddle/fluid/platform/device/xpu/xpu_resource_pool.h" + +namespace paddle { +namespace platform { + +XpuStreamResourcePool::XpuStreamResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuStream stream; + xpu_stream_create(&stream); + return stream; + }; + + auto deleter = [dev_idx](xpuStream stream) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_stream_destroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuStreamResourcePool& XpuStreamResourcePool::Instance() { + static XpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr XpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +XpuEventResourcePool::XpuEventResourcePool() { + int dev_cnt = platform::GetXPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::XPUDeviceGuard gurad(dev_idx); + xpuEventHandle event; + xpu_event_create(&event); + return event; + }; + + auto deleter = [dev_idx](xpuEventHandle event) { + platform::XPUDeviceGuard gurad(dev_idx); + xpu_event_destroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +XpuEventResourcePool& XpuEventResourcePool::Instance() { + static XpuEventResourcePool pool; + return pool; +} + +std::shared_ptr XpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/xpu/xpu_resource_pool.h b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h new file mode 100644 index 0000000000000..5c6ade8f6f88f --- /dev/null +++ b/paddle/fluid/platform/device/xpu/xpu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_XPU) +#include +#include +#include + +#include "paddle/fluid/platform/device/xpu/xpu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using XpuStreamObject = std::remove_pointer::type; +using XpuEventObject = std::remove_pointer::type; + +class XpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuStreamResourcePool &Instance(); + + private: + XpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class XpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static XpuEventResourcePool &Instance(); + + private: + XpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(XpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2c5f24d28c6d6..2b53ecf86a641 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Copyright (c) 2022 NVIDIA Corporation. All rights reserved. Licensed under the Apache License, Version 2.0 (the "License"); @@ -188,6 +188,7 @@ class XPUDeviceContext : public phi::XPUContext { explicit XPUDeviceContext(XPUPlace place); virtual ~XPUDeviceContext(); Eigen::DefaultDevice* eigen_device() const { return nullptr; } + xpuStream stream() const { return XPUContext::x_context()->xpu_stream; } }; template <> From 9ac736c2ad9b4ad90cbfe2a4df001de01ab981b2 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Thu, 12 May 2022 16:02:43 +0800 Subject: [PATCH 29/49] Add cinn pass to program (#42623) * add cinn pass to program * remove build_cinn_pass ut * polish ut, add ut * guard ut with is_compiled_with_cinn * enable ut test_build_cinn_pass_resnet --- .../framework/paddle2cinn/build_cinn_pass.cc | 12 +++++- .../framework/paddle2cinn/cinn_compiler.cc | 4 +- paddle/fluid/operators/cinn/cinn_launch_op.h | 14 +++++++ python/paddle/distributed/passes/cpp_pass.py | 33 +++++++++++++++ .../distributed_passes/dist_pass_test_base.py | 6 +-- .../unittests/distributed_passes/model_zoo.py | 37 ++++++++++++++++ .../test_build_cinn_pass_resnet.py | 41 ++++++++++++++++++ .../test_build_cinn_pass_simple_net.py | 42 +++++++++++++++++++ 8 files changed, 183 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py create mode 100644 python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index e259d6d417a5c..0de89aaad3b0d 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -545,6 +545,15 @@ void ReplaceSubGraphWithCinnOpNode( RemoveSubGraphFromGraph(cluster, cluster_internals, graph); } +static bool IsInplaceOp(const OpDesc& op_desc) { + auto inputs = op_desc.InputArgumentNames(); + std::unordered_set input_set(inputs.begin(), inputs.end()); + for (auto& name : op_desc.OutputArgumentNames()) { + if (input_set.count(name) > 0) return true; + } + return false; +} + // Search all subgraphs which all op node 
supported by CINN, // Here we using SubgraphDetector to detecte the subgraph that // all of op node supported by CINN. We using OpMapperRegistry @@ -565,9 +574,10 @@ void SearchAllSubgraphs(Graph* graph) { if (deny_ops.size()) { return registered && !deny_ops.count(node->Name()); } + // if the user doesn't set FLAGS_allow_cinn_ops and FLAGS_deny_cinn_ops, // return true only when it is registered in CINN - return registered; + return registered && (node->IsOp() && !IsInplaceOp(*node->Op())); }; VLOG(4) << "The allowed Cinn Ops: " << FLAGS_allow_cinn_ops; VLOG(4) << "The denied Cinn Ops: " << FLAGS_deny_cinn_ops; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 51dca93c7c7f0..549c854961764 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -61,8 +61,8 @@ using ::cinn::hlir::framework::BuildScope; using ::cinn::hlir::framework::GraphCompiler; CinnCompiler* CinnCompiler::GetInstance() { - static CinnCompiler instance; - return &instance; + static CinnCompiler* instance = new CinnCompiler(); + return instance; } const CinnCompiledObject& CinnCompiler::Compile( diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 024bf2bceb3d0..6001a4f5c0709 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -101,8 +102,21 @@ class CinnLaunchOpKernel : public framework::OpKernel { // Step 2. Get compilation result of the graph auto target = details::PlaceToCinnTarget(place); + using ClockType = std::chrono::steady_clock; + std::chrono::time_point start_t, end_t; + if (VLOG_IS_ON(1)) { + VLOG(1) << "Starts to compile at thread " << std::this_thread::get_id(); + start_t = ClockType::now(); + } const auto& cinn_compiled_object = CinnCompiler::GetInstance()->Compile( compilation_key, inputs_name2tensor, target, stream); + if (VLOG_IS_ON(1)) { + end_t = ClockType::now(); + auto time_sec = std::chrono::duration_cast( + end_t - start_t); + VLOG(1) << "Ends to compile at thread " << std::this_thread::get_id() + << " , time cost : " << time_sec.count() << " ms"; + } details::DebugCinnCompiledResult(cinn_compiled_object); auto* launch_context = cinn_compiled_object.launch_context.get(); diff --git a/python/paddle/distributed/passes/cpp_pass.py b/python/paddle/distributed/passes/cpp_pass.py index 4a4e5ecbbb495..72525255b7eaa 100644 --- a/python/paddle/distributed/passes/cpp_pass.py +++ b/python/paddle/distributed/passes/cpp_pass.py @@ -13,6 +13,7 @@ # limitations under the License. 
from .pass_base import PassType, CPPPassWrapper, register_pass +from paddle.fluid.framework import core, _apply_pass as _apply_cpp_pass @register_pass("fuse_elewise_add_act") @@ -93,3 +94,35 @@ def cpp_name(self): def _type(self): return PassType.CALC_OPT + + +@register_pass("build_cinn") +class BuildCINNPass(CPPPassWrapper): + def __init__(self): + super(BuildCINNPass, self).__init__() + self.set_attr("allow_ops", []) + self.set_attr("deny_ops", []) + + @property + def cpp_name(self): + return "build_cinn_pass" + + def _type(self): + return PassType.CALC_OPT + + def _apply_single_impl(self, main_program, startup_program, context): + allow_ops = ";".join(self.get_attr("allow_ops")) + deny_ops = ";".join(self.get_attr("deny_ops")) + + assert 'FLAGS_allow_cinn_ops' in core.globals( + ), "PaddlePaddle is not compiled with CINN support" + old_allow_ops = core.globals()['FLAGS_allow_cinn_ops'] + old_deny_ops = core.globals()['FLAGS_deny_cinn_ops'] + try: + core.globals()['FLAGS_allow_cinn_ops'] = allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = deny_ops + _apply_cpp_pass(main_program, startup_program, self.cpp_name, {}, + self.cpp_attr_types) + finally: + core.globals()['FLAGS_allow_cinn_ops'] = old_allow_ops + core.globals()['FLAGS_deny_cinn_ops'] = old_deny_ops diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py index f0ed2cdc04950..786ee06487fbc 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/dist_pass_test_base.py @@ -39,7 +39,7 @@ def prepare_python_path_and_return_module(path): paths.append(dirname) python_path = ":".join(paths) else: - python_path = path + python_path = dirname os.environ[env_name] = python_path print('GLOG_v=', os.environ.get('GLOG_v', None), flush=1) return filename[:-len(py_suffix)] @@ -85,9 +85,9 @@ def apply_passes(self, main_prog, startup_prog): raise NotImplementedError() def check_main(self, model=None, gpus=None, **kwargs): - no_pass_rets = self._distributed_launch( - model=model, apply_pass=True, gpus=gpus, **kwargs) pass_rets = self._distributed_launch( + model=model, apply_pass=True, gpus=gpus, **kwargs) + no_pass_rets = self._distributed_launch( model=model, apply_pass=False, gpus=gpus, **kwargs) self.check_results(no_pass_rets, pass_rets) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py index 0b522b79c4e93..7eebee47e59a8 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/model_zoo.py @@ -59,3 +59,40 @@ def reader(): main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() return main_program, startup_program, [image, label], [loss], reader + + +def simple_net(place, batch_size, image_shape=[784], num_classes=10): + image = paddle.static.data( + shape=[batch_size] + image_shape, dtype='float32', name='image') + label = paddle.static.data( + shape=[batch_size, 1], dtype='int64', name='label') + linears = [nn.Linear(784, 784) for _ in range(3)] + hidden = image + for linear in linears: + hidden = linear(hidden) + hidden = nn.ReLU()(hidden) + loss_fn = nn.loss.CrossEntropyLoss() + loss = loss_fn(hidden, label) + optimizer = paddle.optimizer.Adam(learning_rate=1e-3) + + dist_strategy = 
fleet.DistributedStrategy() + dist_strategy.fuse_all_reduce_ops = False + dist_strategy.without_graph_optimization = True + fleet.init(is_collective=True, strategy=dist_strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + rank = paddle.distributed.get_rank() + + def reader(): + seed = get_seed_from_env() + np.random.seed(seed + rank) + for _ in range(10): + image_np = np.random.random(size=image.shape).astype('float32') + label_np = np.random.randint( + low=0, high=num_classes, size=label.shape).astype('int64') + yield image_np, label_np + + main_program = paddle.static.default_main_program() + startup_program = paddle.static.default_startup_program() + return main_program, startup_program, [image, label], [loss], reader diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py new file mode 100644 index 0000000000000..8430eb615a20c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_resnet.py @@ -0,0 +1,41 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed.passes import new_pass, PassManager +import unittest +from dist_pass_test_base import DistPassTestBase +from model_zoo import resnet_model + + +class TestBuildCINNPass(DistPassTestBase): + def init(self): + self.atol = 0.5 + self.rtol = 0.0 + + def apply_passes(self, main_prog, startup_prog): + pass_manager = PassManager([ + new_pass("build_cinn"), + new_pass("fuse_elewise_add_act"), + ]) + pass_manager.apply([main_prog], [startup_prog]) + print(pass_manager.names) + + def test_bs_32(self): + if paddle.is_compiled_with_cinn(): + self.check_main(resnet_model, batch_size=32) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py new file mode 100644 index 0000000000000..e030420d32420 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_build_cinn_pass_simple_net.py @@ -0,0 +1,42 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import paddle
+from paddle.distributed.passes import new_pass, PassManager
+import unittest
+from dist_pass_test_base import DistPassTestBase
+from model_zoo import simple_net
+
+
+class TestBuildCINNPass(DistPassTestBase):
+    def init(self):
+        self.atol = 0.0
+        self.rtol = 0.0
+
+    def apply_passes(self, main_prog, startup_prog):
+        pass_manager = PassManager([
+            new_pass("build_cinn"),
+            new_pass("fuse_elewise_add_act"),
+        ])
+        pass_manager.apply([main_prog], [startup_prog])
+        op_types = [op.type for op in main_prog.global_block().ops]
+        self.assertTrue('cinn_launch' in op_types)
+
+    def test_bs_32(self):
+        if paddle.is_compiled_with_cinn():
+            self.check_main(simple_net, batch_size=32)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 190cf44f6762b33cf5b24d833bc2d24989fc433b Mon Sep 17 00:00:00 2001
From: fwenguang <95677191+fwenguang@users.noreply.github.com>
Date: Thu, 12 May 2022 17:11:45 +0800
Subject: [PATCH 30/49] [MLU] fix cnnl error when index is 2D (#42669)

---
 paddle/fluid/operators/gather_op_mlu.cc | 39 +++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/operators/gather_op_mlu.cc b/paddle/fluid/operators/gather_op_mlu.cc
index 220d045952643..cf35e051edf87 100644
--- a/paddle/fluid/operators/gather_op_mlu.cc
+++ b/paddle/fluid/operators/gather_op_mlu.cc
@@ -27,11 +27,28 @@ class GatherOpMLUKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto axis = ctx.Attr<int>("axis");
 
+    const auto index_dims = index->dims();
+    if (index_dims.size() == 2) {
+      PADDLE_ENFORCE_EQ(
+          index_dims[1], 1,
+          platform::errors::InvalidArgument(
+              "The last dim of index should be 1 when it is 2D, but we get %d",
+              index_dims[1]));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          index_dims.size(), 1,
+          platform::errors::InvalidArgument(
+              "The index should be 1D, when it is not 2D, but we get %d",
+              index_dims.size()));
+    }
+
     auto *out = ctx.Output<Tensor>("Out");
     out->mutable_data<T>(ctx.GetPlace());
 
     MLUCnnlTensorDesc x_desc(*x);
-    MLUCnnlTensorDesc index_desc(*index);
+    int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
+    MLUCnnlTensorDesc index_desc(1, index_shape_1d,
+                                 ToCnnlDataType(index->dtype()));
     MLUCnnlTensorDesc out_desc(*out);
     MLUCnnl::GatherFunctor(ctx, axis, 0 /*batch_dims*/, x_desc.get(),
                            GetBasePtr(x), index_desc.get(), GetBasePtr(index),
@@ -46,6 +63,22 @@ class GatherGradOpMLUKernel : public framework::OpKernel<T> {
     auto *index = ctx.Input<Tensor>("Index");
     auto *dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto *dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+
+    const auto index_dims = index->dims();
+    if (index_dims.size() == 2) {
+      PADDLE_ENFORCE_EQ(
+          index_dims[1], 1,
+          platform::errors::InvalidArgument(
+              "The last dim of index should be 1 when it is 2D, but we get %d",
+              index_dims[1]));
+    } else {
+      PADDLE_ENFORCE_EQ(
+          index_dims.size(), 1,
+          platform::errors::InvalidArgument(
+              "The index should be 1D, when it is not 2D, but we get %d",
+              index_dims.size()));
+    }
+
     dx->mutable_data<T>(ctx.GetPlace());
 
     MLUCnnlTensorDesc dx_desc(*dx);
@@ -53,7 +86,9 @@ class GatherGradOpMLUKernel : public framework::OpKernel<T> {
     MLUCnnl::Fill(ctx, CNNL_POINTER_MODE_HOST, &value, dx_desc.get(),
                   GetBasePtr(dx));
 
-    MLUCnnlTensorDesc index_desc(*index);
+    int index_shape_1d[1] = {static_cast<int>(index_dims[0])};
+    MLUCnnlTensorDesc index_desc(1, index_shape_1d,
+                                 ToCnnlDataType(index->dtype()));
     MLUCnnlTensorDesc dout_desc(*dout);
     const cnnlScatterRefMode_t mode = CNNL_SCATTERREF_UPDATE;
     MLUCnnl::ScatterFunctor(ctx, dx_desc.get(), GetBasePtr(dx), dout_desc.get(),
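Both hunks above enforce the same contract: the index handed to the CNNL gather/scatter calls must be describable as a flat 1-D list, so a 2-D index is accepted only when its trailing dimension is 1, and is then re-described to CNNL as a rank-1 tensor of length index_dims[0]. A rough standalone sketch of that shape rule, written against plain C++ containers rather than Paddle's DDim and PADDLE_ENFORCE machinery (the helper name is illustrative, not a Paddle or CNNL API):

    #include <stdexcept>
    #include <vector>

    // Mirrors the checks in gather_op_mlu.cc: shape [N] or [N, 1] is valid
    // and is viewed as a 1-D index of length N; anything else is rejected.
    inline int FlattenedIndexLength(const std::vector<int>& index_dims) {
      if (index_dims.size() == 2) {
        if (index_dims[1] != 1) {
          throw std::invalid_argument("last dim of a 2-D index must be 1");
        }
      } else if (index_dims.size() != 1) {
        throw std::invalid_argument("index must be 1-D, or 2-D with last dim 1");
      }
      return index_dims[0];  // length of the 1-D view given to the descriptor
    }

With that length in hand, the kernel builds a rank-1 tensor descriptor instead of passing the raw 2-D shape through, which is what removes the CNNL error the commit title refers to.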
From 272b7f1c17f822eb81d524a55efb634af0d10934 Mon Sep 17 00:00:00 2001 From: Fan Zhang Date: Thu, 12 May 2022 18:55:45 +0800 Subject: [PATCH 31/49] Xpups dev (#42692) * Adapt XPUPS - 1st version - 3.24 * Adapt XPUPS - update XPU PushSparse - 2nd version - 3.24 * Adapt XPUPS - add XPU PullSparseOp - 3nd version - 3.25 * refactor heter comm kernel * update. test=develop * Adapt XPUPS - modify by compilation - 4th version - 3.27 * update calc_shard_offset. test=develop * update xpu kernel. test=develop * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * heter_comm update * heter_comm update * update calc_shard_offset. test=develop * heter_comm update * update args of calc_shard_offset * update. test=develop * remove customGradMerger * update. test=develop * fix. test=develop * update. test=develop * update. test=develop * update optimizer kernel * Adapt XPUPS - use WITH_XPU_KP and modify wrapper kernel function - 5th version - 3.30 * update. test=develop * update pslib.cmake * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * Adapt XPUPS - modify by kp compilation - 6th version - 3.30 * update. test=develop * update. test=develop * update. test=develop * update optimizer kernel * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * update. test=develop * fix. test=develop * fix. test=develop * used by minxu * update heter_comm_inl * fix. test=develop * Adapt XPUPS - modify by kp compilation - 7th version - 3.30 * fix. test=develop * add optimizer kernel. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 3.31 update * Adapt XPUPS - update kp compilation path - 8th version - 3.31 * add optimizer kernel. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix kunlun not support size_t. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * update heter_comm_kernel.kps 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update heter_comm.h 3.31 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * update hashtable. test=develop * update. test=develop * Adapt XPUPS - update by kp compilation - 9th version - 4.1 * update hashtable. test=develop * fix. test=develop * update hashtable 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 10th version - 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * update. test=develop * modify by compilation 4.1 * update. test=develop * update. test=develop * fix. test=develop * modify by compilation 4.1 * update. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * modify by compilation 4.1 19:30 * fix. test=develop * update ps_gpu_wrapper.kps 4.1 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. 
test=develop * Adapt XPUPS - update by kp compilation - 11th version - 4.1 * fix. test=develop * Adapt XPUPS - update by kp compilation - 12nd version - 4.2 * fix. test=develop * fix. test=develop * modify by compilation 4.2 * 4.2 update * fix. test=develop * template init. test=develop * update 4.6 * fix. test=develop * template init. test=develop * 4.6 modify by compilation * hashtable template init. test=develop * hashtable template init. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=devlop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * Adapt XPUPS - update by kp compilation - 13nd version - 4.7 * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.11 update * fix. test=develop * fix. test=develop * 4.11 update * update by pre-commit * fix. test=develop * fix. test=develop * fix. test=develop * fix. test=develop * 4.12 update * fix. test=develop * Adapt XPUPS - update by kp compilation - 14th version - 4.13 * 4.13 update * 4.14 update * 4.14 update * 4.14 update * 4.14 modify by merged latest compilation * retry CI 4.14 * 4.15 pass static check * 4.15 modify by gpups CI * 3.16 update by gpups CI - modify ps_gpu_wrapper.h * 4.16 update * 4.16 pass xpu compile * 4.16 retry CI * 4.16 update * Adapt XPUPS - adapt BKCL comm for XPUPS - 4.24 * update by compilation * Adapt XPUPS - register PSGPUTrainer for XPUPS - 4.25 * update device_worker_factory * Adapt XPUPS - split heter_ps into .cu and .cc - 4.27 * Adapt XPUPS - register pull_box_sparse op under XPU_KP - 4.28 * update * 5.7 modify ps_gpu_wrapper pull_sparse * 5.11 update ps_gpu_wrapper CopyKeysKernel Co-authored-by: zmxdream --- .../fluid/framework/fleet/ps_gpu_wrapper.cc | 43 +++++-------------- .../fluid/framework/fleet/ps_gpu_wrapper.kps | 23 ++++++---- 2 files changed, 26 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 64765c98fd04b..f512fcc7b9fdb 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -898,17 +898,9 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); -#ifdef PADDLE_WITH_CUDA - VLOG(3) << "Begine Gpu Ps PullSparse"; + VLOG(3) << "Begine Gpu/Xpu Ps PullSparse"; auto buf = memory::Alloc(place, total_length * sizeof(FeatureValue)); FeatureValue* total_values_gpu = reinterpret_cast(buf->ptr()); -#endif -#ifdef PADDLE_WITH_XPU_KP - VLOG(3) << "Begine Xpu Ps PullSparse"; - FeatureValue* total_values_gpu = nullptr; - xpu_malloc(reinterpret_cast(&total_values_gpu), - total_length * sizeof(FeatureValue)); -#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GpuPs now.")); @@ -969,19 +961,11 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, slot_lengths_lod[i] += slot_lengths_lod[i - 1]; } - uint64_t* buf_key = nullptr; - int64_t* buf_length = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_key), - keys.size() * sizeof(uint64_t*)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - 
PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&buf_length), - slot_lengths.size() * sizeof(int64_t)), - XPU_SUCCESS, platform::errors::ResourceExhausted( - "XPU has no enough memory")); - - uint64_t** xpu_keys = reinterpret_cast(&buf_key); - int64_t* xpu_len = reinterpret_cast(buf_length); + auto buf_key = memory::Alloc(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::Alloc(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** xpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* xpu_len = reinterpret_cast(buf_length->ptr()); PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(xpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), XPU_HOST_TO_DEVICE)); @@ -997,8 +981,6 @@ void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, pull_gpups_timer.Start(); HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu, static_cast(total_length)); - // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - // "PullSparseGPU failed in GPUPS.")); pull_gpups_timer.Pause(); VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length @@ -1029,22 +1011,16 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, all_timer.Start(); int64_t total_length = std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); -#ifdef PADDLE_WITH_CUDA + // #ifdef PADDLE_WITH_CUDA VLOG(3) << "Begin GPUPS PushSparseGrad"; auto buf = memory::Alloc(place, total_length * sizeof(FeaturePushValue)); FeaturePushValue* total_grad_values_gpu = reinterpret_cast(buf->ptr()); -#endif -#ifdef PADDLE_WITH_XPU_KP - VLOG(3) << "Begine Xpu Ps PushSparseGrad"; - FeaturePushValue* total_grad_values_gpu = nullptr; - xpu_malloc(reinterpret_cast(&total_grad_values_gpu), - total_length * sizeof(FeaturePushValue)); -#endif if (platform::is_cpu_place(place)) { PADDLE_THROW(platform::errors::Unimplemented( "Warning:: CPUPlace is not supported in GPUPS now.")); } else if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; @@ -1060,7 +1036,9 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU_KP int device_id = place.GetDeviceId(); int devid_2_index = HeterPs_->get_index_by_devid(device_id); LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index]; @@ -1076,6 +1054,7 @@ void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place, HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu, static_cast(total_length)); push_gpups_timer.Pause(); +#endif } else { PADDLE_THROW(platform::errors::PreconditionNotMet( "GPUPS: PushSparseGrad Only Support CUDAPlace Now.")); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps index 28dd873a117dc..58b9f0f722f8c 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.kps @@ -84,7 +84,7 @@ __global__ void PullCopy(float** dest, const FeatureValue* src, } } -__global__ void CopyKeysKernel(unsigned long long** src_keys, +__global__ void CopyKeysKernel(unsigned long long* src_keys, unsigned long long* dest_total_keys, const long long* len, int slot_num, int total_len) { @@ -95,21 +95,27 @@ __global__ void 
CopyKeysKernel(unsigned long long** src_keys,
   }
   int thread_id = ncores * cluster_id() + cid;
   int nthreads = ncores * cluster_num();
-  __local__ int64_t local_len[slot_num];
-  GM2LM(len, local_len, slot_num * sizeof(int64_t));
+  __local__ long long local_len[slot_num];
+  GM2LM(len, local_len, slot_num * sizeof(long long));
+
+  __global_ptr__ unsigned long long* local_keys[slot_num];
+  GM2LM(src_keys, local_keys,
+        slot_num * sizeof(__global_ptr__ unsigned long long*));
 
   for (int i = thread_id; i < slot_num; i += nthreads) {
     // max core local memory = 8KB
     int slot_len = i ? local_len[i] - local_len[i - 1] : local_len[0];
-    int read_len = min(slot_len, 1024);
+    // int read_len = min(slot_len, 1024);
+    int read_len = 100;
     int dest_len = i ? local_len[i - 1] : 0;
-    __local__ uint64_t local_slot_keys[read_len];
+    __local__ unsigned long long local_slot_keys[read_len];
     for (int k = 0; k < slot_len; k += read_len) {
       int real_read_len = min(read_len, slot_len - k);
-      GM2LM(src_keys[i] + k, local_slot_keys, real_read_len * sizeof(uint64_t));
+      GM2LM(local_keys[i] + k, local_slot_keys,
+            real_read_len * sizeof(unsigned long long));
       LM2GM(local_slot_keys, dest_total_keys + dest_len + k,
-            real_read_len * sizeof(uint64_t));
+            real_read_len * sizeof(unsigned long long));
     }
   }
 }
@@ -199,7 +205,8 @@ void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
   stream = static_cast<platform::XPUDeviceContext*>(dev_ctx)
                ->x_context()
                ->xpu_stream;
-  unsigned long long** o_keys = (unsigned long long**)origin_keys;
+  unsigned long long* o_keys =
+      reinterpret_cast<unsigned long long*>(origin_keys);
   unsigned long long* t_keys = (unsigned long long*)total_keys;
   const long long* c_len = (const long long*)gpu_len;
   CopyKeysKernel<<<2, 64, stream>>>(o_keys, t_keys, c_len, slot_num, total_len);

From 91cf770bd2feb2406e192a6d319a0dcd2de73537 Mon Sep 17 00:00:00 2001
From: piotrekobi <48731682+piotrekobi@users.noreply.github.com>
Date: Thu, 12 May 2022 13:28:58 +0200
Subject: [PATCH 32/49] Change oneDNN SHA (#42657)

---
 cmake/external/mkldnn.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 29625b2b52e18..8f955008fa079 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn)
 SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
 SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
 SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git)
-SET(MKLDNN_TAG 9a35435c18722ff17a48fb60bceac42bfdf78754)
+SET(MKLDNN_TAG 9b186765dded79066e0cd9c17eb70b680b76fb8e)
 
 # Introduce variables:

From 2012672c10c53369936f988aab47ab9001fabab0 Mon Sep 17 00:00:00 2001
From: Shuangchi He <34329208+Yulv-git@users.noreply.github.com>
Date: Thu, 12 May 2022 23:25:03 +0800
Subject: [PATCH 33/49] Fix some typos in paddle/.
(#42408) --- .../fluid/distributed/ps/service/ps_client.h | 2 +- paddle/fluid/framework/data_set.h | 2 +- .../details/sparse_all_reduce_op_handle.cc | 4 +- .../fluid/framework/heter_pipeline_trainer.cc | 2 +- .../fuse_optimizer_op_pass.cc | 2 +- .../framework/ir/fusion_group/operation.cc | 2 +- .../framework/ir/graph_pattern_detector.cc | 2 +- .../framework/ir/graph_pattern_detector.h | 2 +- .../framework/new_executor/interpretercore.cc | 2 +- .../new_executor/interpretercore_util.cc | 2 +- .../new_executor/workqueue/workqueue.h | 2 +- .../framework/paddle2cinn/build_cinn_pass.cc | 2 +- paddle/fluid/framework/prune.cc | 2 +- paddle/fluid/imperative/layer.cc | 2 +- paddle/fluid/imperative/reducer.cc | 4 +- .../fluid/inference/analysis/CMakeLists.txt | 2 +- .../analysis/ir_passes/subgraph_util.cc | 2 +- .../analysis/ir_passes/subgraph_util.h | 2 +- .../ir_passes/tensorrt_subgraph_pass.cc | 2 +- .../inference/api/paddle_analysis_config.h | 2 +- .../inference/tensorrt/convert/swish_op.cc | 2 +- .../analyzer_lexical_analysis_gru_tester.cc | 6 +-- paddle/fluid/operators/activation_op.h | 2 +- paddle/fluid/operators/affine_grid_op.cc | 2 +- paddle/fluid/operators/batch_norm_op.cc | 2 +- .../operators/collective/c_broadcast_op.cu.cc | 2 +- .../collective/c_broadcast_op_mlu.cc | 2 +- .../collective/c_broadcast_op_npu.cc | 2 +- paddle/fluid/operators/conv_op.h | 6 +-- paddle/fluid/operators/conv_op_mlu.cc | 2 +- paddle/fluid/operators/ctc_align_op.cu | 8 ++-- .../operators/deformable_psroi_pooling_op.cu | 2 +- .../operators/deformable_psroi_pooling_op.h | 2 +- .../operators/detection/matrix_nms_op.cc | 2 +- paddle/fluid/operators/dropout_op_npu.cc | 2 +- .../operators/elementwise/elementwise_mlu.h | 2 +- paddle/fluid/operators/fc_op.cc | 4 +- paddle/fluid/operators/fc_op.h | 2 +- paddle/fluid/operators/fill_constant_op.cc | 2 +- paddle/fluid/operators/fold_op.cc | 16 +++---- .../fluid/operators/fused/conv_fusion_op.cc | 4 +- .../fluid/operators/fused/conv_fusion_op.cu | 6 +-- .../fused/cudnn_bn_stats_finalize.cu.h | 2 +- .../operators/fused/cudnn_norm_conv.cu.h | 12 ++--- .../fused/cudnn_scale_bias_add_relu.cu.h | 6 +-- .../fused_fc_elementwise_layernorm_op.cc | 16 +++---- .../fluid/operators/fused/fusion_group_op.cc | 2 +- paddle/fluid/operators/inverse_op.cc | 6 +-- paddle/fluid/operators/lod_reset_op.h | 2 +- paddle/fluid/operators/math/cross_entropy.h | 2 +- .../operators/math/selected_rows_functor.cc | 40 ++++++++-------- .../operators/math/selected_rows_functor.cu | 46 +++++++++---------- paddle/fluid/operators/metrics/accuracy_op.cc | 2 +- paddle/fluid/operators/mlu/mlu_baseop.cc | 4 +- .../operators/positive_negative_pair_op.cc | 4 +- .../operators/prune_gate_by_capacity_op.cu | 2 +- .../pscore/heter_listen_and_serv_op.cc | 4 +- paddle/fluid/operators/rnn_op_xpu.cc | 2 +- paddle/fluid/operators/sample_logits_op.cc | 2 +- ...igmoid_cross_entropy_with_logits_op_npu.cc | 2 +- paddle/fluid/operators/sum_op.cu | 4 +- paddle/fluid/operators/tdm_child_op.h | 2 +- paddle/fluid/operators/warpctc_op.cc | 2 +- paddle/fluid/platform/CMakeLists.txt | 2 +- .../fluid/platform/device/npu/npu_op_runner.h | 2 +- paddle/fluid/platform/profiler/event_node.cc | 2 +- paddle/infrt/common/object.h | 2 +- paddle/phi/infermeta/binary.cc | 2 +- paddle/phi/infermeta/multiary.cc | 4 +- paddle/phi/infermeta/unary.cc | 20 ++++---- paddle/phi/kernels/cpu/conv_util.h | 2 +- paddle/phi/kernels/cpu/rnn_kernel.cc | 2 +- paddle/phi/kernels/funcs/broadcast_function.h | 8 ++-- paddle/phi/kernels/funcs/elementwise_base.h | 2 +- 
paddle/phi/kernels/gpu/bce_loss_kernel.cu | 2 +- .../kernels/impl/searchsorted_kernel_impl.h | 2 +- paddle/phi/ops/compat/scale_sig.cc | 2 +- python/paddle/README.rst | 2 +- .../auto_parallel/process_group.py | 2 +- .../fleet/base/distributed_strategy.py | 8 ++-- python/paddle/distributed/fleet/launch.py | 2 +- .../meta_optimizers/ascend/ascend_parser.py | 2 +- .../dygraph_sharding_optimizer.py | 4 +- python/paddle/distributed/launch/main.py | 2 +- .../distributed/passes/auto_parallel_fp16.py | 2 +- python/paddle/distributed/ps/utils/public.py | 2 +- ..._post_training_quantization_mobilenetv1.py | 2 +- python/paddle/fluid/dygraph/checkpoint.py | 2 +- .../dygraph_to_static/convert_operators.py | 2 +- python/paddle/fluid/executor.py | 2 +- .../fleet/parameter_server/ir/trainer_pass.py | 2 +- python/paddle/fluid/layer_helper_base.py | 2 +- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/layers/rnn.py | 2 +- .../unittests/ir/inference/auto_scan_test.py | 2 +- .../test_shuffle_channel_detect_pass.py | 4 +- .../fluid/tests/unittests/ir/pass_test.py | 2 +- .../paddle/fluid/tests/unittests/op_test.py | 6 +-- .../tests/unittests/seresnext_test_base.py | 4 +- .../static_model_parallel_fused_attention.py | 4 +- ...static_model_parallel_fused_feedforward.py | 4 +- .../tests/unittests/test_fleet_rolemaker.py | 2 +- .../unittests/test_fleet_rolemaker_new.py | 2 +- .../tests/unittests/test_gradient_clip.py | 18 ++++---- .../fluid/tests/unittests/test_hsigmoid_op.py | 2 +- .../fluid/tests/unittests/test_optimizer.py | 2 +- ...st_parallel_executor_seresnext_base_cpu.py | 2 +- ...st_parallel_executor_seresnext_base_gpu.py | 2 +- .../fluid/tests/unittests/test_unpool_op.py | 4 +- python/paddle/framework/io.py | 2 +- python/paddle/hapi/model.py | 4 +- python/paddle/incubate/autotune.py | 2 +- .../distributed/models/moe/grad_clip.py | 2 +- .../incubate/nn/layer/fused_transformer.py | 20 ++++---- python/paddle/nn/functional/loss.py | 6 +-- python/paddle/nn/layer/activation.py | 2 +- python/paddle/nn/layer/transformer.py | 22 ++++----- python/paddle/profiler/profiler.py | 2 +- python/paddle/profiler/timer.py | 8 ++-- python/paddle/profiler/utils.py | 2 +- python/paddle/tensor/math.py | 4 +- python/paddle/tensor/to_string.py | 2 +- python/paddle/vision/models/mobilenetv3.py | 2 +- 123 files changed, 262 insertions(+), 262 deletions(-) diff --git a/paddle/fluid/distributed/ps/service/ps_client.h b/paddle/fluid/distributed/ps/service/ps_client.h index 0d3d23be4e8d1..926bb7e7c9fd3 100644 --- a/paddle/fluid/distributed/ps/service/ps_client.h +++ b/paddle/fluid/distributed/ps/service/ps_client.h @@ -109,7 +109,7 @@ class PSClient { size_t table_id) = 0; // 保留 // firstly push dense param for parameter server - // this is neccessary because dense weight initialized in trainer on cold + // this is necessary because dense weight initialized in trainer on cold // start virtual std::future PushDenseParam(const Region *regions, size_t region_num, diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index 1947c669e9bb0..3f10cd7765bc1 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -152,7 +152,7 @@ class Dataset { virtual void DestroyPreLoadReaders() = 0; // set preload thread num virtual void SetPreLoadThreadNum(int thread_num) = 0; - // seperate train thread and dataset thread + // separate train thread and dataset thread virtual void DynamicAdjustChannelNum(int channel_num, bool discard_remaining_ins = false) = 0; virtual void 
DynamicAdjustReadersNum(int thread_num) = 0; diff --git a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc index d198eb1459288..7e63c5ffb9a44 100644 --- a/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/sparse_all_reduce_op_handle.cc @@ -75,7 +75,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { in_var_handles.size(), places_.size(), platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " - "places, but got the number of input variables is %zu and the the " + "places, but got the number of input variables is %zu and the " "number of places is %zu.", in_var_handles.size(), places_.size())); PADDLE_ENFORCE_EQ( @@ -83,7 +83,7 @@ void SparseAllReduceOpHandle::RunImplEncoded() { platform::errors::PreconditionNotMet( "The number of input variables should be equal to the number of " "output variables, but got the number of input variables is %zu and " - "the the number of output variables is %zu.", + "the number of output variables is %zu.", in_var_handles.size(), out_var_handles.size())); std::vector ins; diff --git a/paddle/fluid/framework/heter_pipeline_trainer.cc b/paddle/fluid/framework/heter_pipeline_trainer.cc index 13eb78874c395..d0d3c2fea3b56 100644 --- a/paddle/fluid/framework/heter_pipeline_trainer.cc +++ b/paddle/fluid/framework/heter_pipeline_trainer.cc @@ -282,7 +282,7 @@ void HeterPipelineTrainer::Run() { if (threads_.size() > 0) { threads_.clear(); } - VLOG(3) << "Epoch Trainging done"; + VLOG(3) << "Epoch Training done"; } void HeterPipelineTrainer::Finalize() { diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index 48df5869a7a1f..40e1de8a523aa 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -172,7 +172,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { VLOG(6) << "The number of new gradients is " << new_grad_idx.size(); if (new_grad_idx.size() == 1) return; // NOTE(zcd): If the gradients of backward stage and optimization stage - // have diff, Only take care of the the gradient of optimization stage. + // have diff, Only take care of the gradient of optimization stage. GradientsFilter(new_grad_idx, &opt_nodes, &aux_var_map); } } diff --git a/paddle/fluid/framework/ir/fusion_group/operation.cc b/paddle/fluid/framework/ir/fusion_group/operation.cc index 921cf0904f632..2b7a3e1899c76 100644 --- a/paddle/fluid/framework/ir/fusion_group/operation.cc +++ b/paddle/fluid/framework/ir/fusion_group/operation.cc @@ -127,7 +127,7 @@ void OperationMap::InsertUnaryElementwiseOperations() { // scale // out = (bias_after_scale) ? scale * X + bias : scale(X + bias) - // here we use '=' operator to seperate th default value + // here we use '=' operator to separate th default value // TODO(wangchaochaohu): Later we need to support Tensor input for scale and // bias. 
insert_handler( diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index fbd8fda131b6d..8c8d9fdddec85 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -720,7 +720,7 @@ bool HasOutput(Node *op, const std::string &argument) { PADDLE_ENFORCE_EQ( op->IsOp(), true, platform::errors::InvalidArgument( - "First parameter of function HasOuput must be Node::Op")); + "First parameter of function HasOutput must be Node::Op")); auto const &names = op->Op()->OutputNames(); if (std::find(names.begin(), names.end(), argument) == names.end()) return false; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c9fea057d444d..9e5a82fc44586 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1432,7 +1432,7 @@ struct PriorBox : public PatternBase { }; // Conv + ElementwiseAdd + an activation -// This pattern can futher fuse the conv related ops after the conv+bn fusion. +// This pattern can further fuse the conv related ops after the conv+bn fusion. struct ConvElementwiseaddAct : public PatternBase { ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6735406aacde7..da2fd0c8c6114 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -277,7 +277,7 @@ void InterpreterCore::Convert( } for (size_t i = 0; i < vec_instruction_.size(); ++i) { - // checkout ouput + // checkout output for (auto& item : vec_instruction_[i].Outputs()) { for (auto var_id : item.second) { if (input_var2op_info_.at(var_id).size() == 0) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index d6de37a72c772..f601a4ad28bd7 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -666,7 +666,7 @@ std::map> get_downstream_map( VLOG(6) << "downstream count: " << downstream_map_count(); VLOG(6) << "downstream_map: " << std::endl << downstream_map_to_str(); - // step2: remove unneccessary downstream ops + // step2: remove unnecessary downstream ops // for example, a->b->c // a: b, c // b: c diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.h b/paddle/fluid/framework/new_executor/workqueue/workqueue.h index e9c658e3b9dc6..2c2576528fe0e 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.h @@ -89,7 +89,7 @@ struct WorkQueueOptions { // If you need to blocking the calling thread to wait "queue empty", set // track_task = true and set events_waiter. EventsWaiter::WaitEvent will // block the calling thread until any of events (including "queue empty") - // occured. + // occurred. bool track_task; // If you need to be noticed when a WorkQueue Destruct() , set detached = // false and set events_waiter. 
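The workqueue.h hunk above touches the comment describing how a caller can block until the queue drains: enable track_task, register an events_waiter, and EventsWaiter::WaitEvent returns once an event such as "queue empty" has occurred. As a rough sketch of that "count pending tasks, signal on empty" idea in isolation, here is a task counter with the same wait-until-empty behavior built from standard-library primitives only (an illustration, not Paddle's EventsWaiter or WorkQueue implementation):

    #include <condition_variable>
    #include <mutex>

    // Toy tracker: producers call OnAddTask, workers call OnTaskDone, and a
    // caller blocks in WaitUntilEmpty until the pending count drops to zero.
    class PendingTaskTracker {
     public:
      void OnAddTask() {
        std::lock_guard<std::mutex> guard(mu_);
        ++pending_;
      }

      void OnTaskDone() {
        std::lock_guard<std::mutex> guard(mu_);
        if (--pending_ == 0) empty_cv_.notify_all();
      }

      void WaitUntilEmpty() {
        std::unique_lock<std::mutex> lock(mu_);
        empty_cv_.wait(lock, [this] { return pending_ == 0; });
      }

     private:
      std::mutex mu_;
      std::condition_variable empty_cv_;
      int pending_ = 0;
    };

The predicate form of condition_variable::wait handles spurious wakeups, so WaitUntilEmpty only returns once the count has truly reached zero.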
diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index 0de89aaad3b0d..3de702027bb6c 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -511,7 +511,7 @@ void AddCinnOpToGraph(const GraphNodeSet& cluster, ExtractOpRole(cluster)); cinn_op_desc.Flush(); auto* cinn_op_node = graph->CreateOpNode(&cinn_op_desc); - // Add new links from or to the the cinn launch op node + // Add new links from or to the cinn launch op node AddLinkToCinnOp(cluster_inputs, cluster_outputs, cinn_op_node); VLOG(4) << "Add op [" << kCinnLaunchOp << "] into graph."; diff --git a/paddle/fluid/framework/prune.cc b/paddle/fluid/framework/prune.cc index efbab83f7d0e8..4c95f01ae569f 100644 --- a/paddle/fluid/framework/prune.cc +++ b/paddle/fluid/framework/prune.cc @@ -421,7 +421,7 @@ void PruneBackwardImpl(proto::BlockDesc* origin, proto::BlockDesc* pruned) { for (const auto& name : var_names) { if (var_map.count(name)) { // NOTE(zhiqiu): For operator in a conditional block, the related vars - // may not exist in current block, but in its futher block. + // may not exist in current block, but in its further block. *pruned_vars->Add() = var_map[name]; } } diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index e928cbb654839..76f64ab73a64b 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -186,7 +186,7 @@ template static void SetForwardDataTypeOfGradVars(const NameVarMap& outs) { for (auto& var_pair : outs) { for (auto& var : var_pair.second) { - // NOTE(zhiqu): The ouput may be NULL because of pruning. + // NOTE(zhiqu): The output may be NULL because of pruning. if (var) { SetForwardDataTypeOfGradVar(var); } diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 03fa46eab5367..c7fd2215eb42a 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -879,7 +879,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { } // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as same as multi gpus card trainging. +// fixed as same as multi gpus card training. void Reducer::MarkGroupReady(size_t group_index) { PADDLE_ENFORCE_GE( group_index, next_group_, @@ -957,7 +957,7 @@ void Reducer::FusedAllReduceSchedule(const int run_order, Group &group, // default stream for communicating, so there exist some problems in // synchronization. And need to add a WaitComm there. // TODO(liuyuhui): If BKCL support non-blocking communication, it should be -// fixed as multi gpus card trainging. +// fixed as multi gpus card training. 
#ifdef PADDLE_WITH_XPU_BKCL if (platform::is_xpu_place(group.dense_tensors_[0].place())) { parallel_ctx_->WaitComm(run_order); diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt index dab1b9f7b1135..3d1a467565c84 100644 --- a/paddle/fluid/inference/analysis/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/CMakeLists.txt @@ -1,5 +1,5 @@ unset(analysis_deps CACHE) -set(analysis_deps # analysis_deps can be extended accross the project +set(analysis_deps # analysis_deps can be extended across the project framework_proto proto_desc graph pass paddle_inference_io executor pretty_log ir_pass_manager CACHE INTERNAL "") diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc index 09494a360270b..0c9f8d7e16558 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. */ #include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h index 621c631b8539b..21bfe7582061a 100644 --- a/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h +++ b/paddle/fluid/inference/analysis/ir_passes/subgraph_util.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ /* - * This file defines the the class to partition a graph. + * This file defines the class to partition a graph. */ #pragma once diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 059a9cb21e1d5..bc7dc9704ac5e 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -286,7 +286,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp( // There are models with the same structure but the different parameters, // when running in the 'use_serialize' mode, there is a bug. // serialization is affected by max_batch_size, but calibration is not. - // So we use seperate engine keys in serialization and calibration. + // So we use separate engine keys in serialization and calibration. auto engine_key = GenerateEngineKey( input_names_with_id, output_names_with_id, std::to_string(0), std::to_string(max_batch_size), diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 8edbc494ab886..af6cf88a3224f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -410,7 +410,7 @@ struct PD_INFER_DECL AnalysisConfig { /// \return int The NPU device id. /// int npu_device_id() const { return npu_device_id_; } - /// \brief Get the the number of IPU device . + /// \brief Get the number of IPU device . /// /// \return int The number of IPU device. 
/// diff --git a/paddle/fluid/inference/tensorrt/convert/swish_op.cc b/paddle/fluid/inference/tensorrt/convert/swish_op.cc index 0df5c013d34d4..0b9a6917dd972 100644 --- a/paddle/fluid/inference/tensorrt/convert/swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/swish_op.cc @@ -52,7 +52,7 @@ class SwishOpConverter : public OpConverter { PADDLE_ENFORCE_EQ( output_num, 1UL, platform::errors::InvalidArgument( - "The ouput Out's size must equal to 1 in TRT swish op. " + "The output Out's size must equal to 1 in TRT swish op. " "But received Out's size %u.", output_num)); // Get attrs diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc index cca8ac2634c6c..141e60513eb95 100644 --- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc @@ -147,10 +147,10 @@ void SetInput(std::vector> *inputs, file.read(reinterpret_cast(&total_words_num), sizeof(int64_t)); LOG(INFO) << "Total words in file: " << total_words_num; size_t lods_beginning_offset = static_cast(file.tellg()); - auto words_begining_offset = + auto words_beginning_offset = lods_beginning_offset + sizeof(size_t) * total_sentences_num; auto targets_beginning_offset = - words_begining_offset + sizeof(int64_t) * total_words_num; + words_beginning_offset + sizeof(int64_t) * total_words_num; std::vector lod_full = ReadSentenceLod(file, lods_beginning_offset, total_sentences_num); @@ -158,7 +158,7 @@ void SetInput(std::vector> *inputs, size_t lods_sum = std::accumulate(lod_full.begin(), lod_full.end(), 0UL); EXPECT_EQ(lods_sum, static_cast(total_words_num)); - TensorReader words_reader(file, words_begining_offset, "words"); + TensorReader words_reader(file, words_beginning_offset, "words"); TensorReader targets_reader(file, targets_beginning_offset, "targets"); // If FLAGS_iterations is set to 0, run all batches diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 5448ed2a4bdad..8214b733f86da 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -634,7 +634,7 @@ struct SquareGradGradFunctor : public BaseActivationFunctor { // TODO(dengkaipeng): double gradient calculation for Square/Sqrt need // DOut(dy) as input(not output), tensor extraction is different from -// others. Impliment extraction kernel seperately here. +// others. Impliment extraction kernel separately here. 
inline void ExtractDoubleGradTensorWithInputDOut( const framework::ExecutionContext& ctx, const framework::Tensor** X, const framework::Tensor** ddX, framework::Tensor** dX, diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index efaea94f26e8d..e311d21bb54d3 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -136,7 +136,7 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { .AsExtra(); AddAttr("align_corners", "(bool, default false) Whether to align the corners of input" - "and ouput.") + "and output.") .SetDefault(true); AddAttr>( "output_shape", diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 36a0d53e05245..2663a08101157 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -64,7 +64,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const { (x_dims[i] == -1) || (x_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", x_dims[i], x_dims)); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index 4bed282ace8d1..eeae16a0d71f3 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -77,7 +77,7 @@ class CBroadcastOpCUDAKernel : public framework::OpKernel { PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::ncclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index d1e269fb5a4fe..8f07480aaab14 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -62,7 +62,7 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { } else { PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(out->mutable_data(place), numel, dtype, root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received " << phi::product(out->dims()); } diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc index 31961d8a246a9..a065e49ff72be 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -59,7 +59,7 @@ class CBroadcastOpASCENDKernel : public framework::OpKernel { PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); - VLOG(3) << "rank " << comm->rank() << " invoke Bcast. recieved " + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
received " << phi::product(out->dims()); dev_ctx->Wait(); diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index a5d888765bf37..58f2eeee256db 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -43,7 +43,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", @@ -60,7 +60,7 @@ inline int ConvOutputSize(int input_size, int filter_size, int dilation, output_size, 0, platform::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + padding_1 + padding_2 - (dilation * (filter_size - " "1) + 1)) / stride + 1), where input_size is %d, padding is " "(%d, %d), filter_size is %d, dilation is %d, stride is %d.", @@ -90,7 +90,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, platform::errors::InvalidArgument( "Attribute padding's size should be the same or twice as the " "input's dimension. " - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), phi::make_ddim(*paddings), data_dims.size(), data_dims)); diff --git a/paddle/fluid/operators/conv_op_mlu.cc b/paddle/fluid/operators/conv_op_mlu.cc index 1ee772ec72950..c1517dbe16f84 100644 --- a/paddle/fluid/operators/conv_op_mlu.cc +++ b/paddle/fluid/operators/conv_op_mlu.cc @@ -98,7 +98,7 @@ class MLUConvOpKernel : public framework::OpKernel { output_desc.get(), GetBasePtr(&output_tensor)); if (!channel_last) { - // transpose ouput from NHWC to NCHW + // transpose output from NHWC to NCHW const std::vector perm_to_nchw = {0, 3, 1, 2}; TransposeFromMLUTensor(ctx, perm_to_nchw, &output_tensor, output, false /*need_reshape_or_alloc*/); diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index b1f2e61ef3930..ba90c677570c5 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -26,19 +26,19 @@ __global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens, const size_t num_seq, size_t* lod0, const int blank, const int merge_repeated, size_t* out_lod0, T* output) { - int ouput_idx = 0; + int output_idx = 0; out_lod0[0] = 0; for (int i = 0; i < num_seq; ++i) { T pre_token = -1; for (int j = lod0[i]; j < lod0[i + 1]; ++j) { if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) { - output[ouput_idx] = tokens[j]; - ++ouput_idx; + output[output_idx] = tokens[j]; + ++output_idx; } pre_token = tokens[j]; } - out_lod0[i + 1] = ouput_idx; + out_lod0[i + 1] = output_idx; } } diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 448f67a4bad7a..873950b2d2f65 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -200,7 +200,7 @@ class DeformablePSROIPoolCUDAKernel : 
public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); const int count = num_rois * output_dim * pooled_height * pooled_width; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index 51a0fe4172ca2..3deabce54ed0b 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -175,7 +175,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { num_rois, out->dims()[0], platform::errors::InvalidArgument( "The number of Input(ROIs) should be same with the number of " - "Ouput(Output), but received ROIs number is:%d, Output number " + "Output(Output), but received ROIs number is:%d, Output number " "is:%d.", num_rois, out->dims()[0])); framework::Tensor roi_batch_id_list; diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index 713c2dc7fe9c1..3353739b01bf6 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -385,7 +385,7 @@ independently for each class. The outputs is a 2-D LoDTenosr, for each image, the offsets in first dimension of LoDTensor are called LoD, the number of offset is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0, means there is no detected bbox for this image. Now this operator has one more -ouput, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of +output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of detected bbox for this image. For more information on Matrix NMS, please refer to: diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index 07b3b53811625..104ab1b504640 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -54,7 +54,7 @@ class DropoutNPUKernel : public framework::OpKernel { return; } - // only achive the default `upscale_in_train` method + // only achieve the default `upscale_in_train` method if (!is_test) { Tensor tmp_x(x->dtype()); Tensor tmp_out(out->dtype()); diff --git a/paddle/fluid/operators/elementwise/elementwise_mlu.h b/paddle/fluid/operators/elementwise/elementwise_mlu.h index 156cea81c0f63..ff1e12103be91 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mlu.h +++ b/paddle/fluid/operators/elementwise/elementwise_mlu.h @@ -165,7 +165,7 @@ template void MLUUnary(const framework::ExecutionContext& ctx, cnnlComputationPreference_t prefer, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, void* output); + const cnnlTensorDescriptor_t output_desc, void* output); template <> inline void MLUUnary(const framework::ExecutionContext& ctx, diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 33518953004ae..6e646f0d4bf26 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -75,7 +75,7 @@ class FCOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute in_num_col_dims used to flatten Input to " "a 2-D tensor, is expected to be less than the number of " - "Input's dimensions. 
But recieved in_num_col_dims is %d, " + "Input's dimensions. But received in_num_col_dims is %d, " "the number of Input's dimensions is %d, Input's shape is %s.", in_num_col_dims, in_dims.size(), in_dims)); @@ -93,7 +93,7 @@ class FCOp : public framework::OperatorWithKernel { in_dims.size() >= 2 && in_dims.size() <= 4, true, platform::errors::Unimplemented( "The Input of fc is expected to be a 2-D, 3-D or 4-D tensor when " - "use_mkldnn is set. But recieved the number of Input's " + "use_mkldnn is set. But received the number of Input's " "dimensions is %d, Input's shape is %s.", in_dims.size(), in_dims)); } diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 6d3b531ce0aa6..47c7128603587 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -36,7 +36,7 @@ inline void FCOutputSize(const framework::DDim& in_dims, in_mat_dims[1], w_dims0, platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", in_mat_dims[1], in_mat_dims, w_dims0, diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 2e924da283ab3..07593a70f05b7 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -32,7 +32,7 @@ class FillConstantOp : public framework::OperatorWithKernel { shape[i], 0, platform::errors::InvalidArgument( "Each value of attribute 'shape' is expected to be no less " - "than 0. But recieved: shape[%u] = %d; shape = [%s].", + "than 0. But received: shape[%u] = %d; shape = [%s].", i, shape[i], phi::make_ddim(shape))); } } diff --git a/paddle/fluid/operators/fold_op.cc b/paddle/fluid/operators/fold_op.cc index 92f59e118c3b7..9c9183c8fafa4 100644 --- a/paddle/fluid/operators/fold_op.cc +++ b/paddle/fluid/operators/fold_op.cc @@ -76,47 +76,47 @@ class FoldOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(kernel_height, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_width, 0, platform::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides PADDLE_ENFORCE_GT(stride_height, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(stride_width, 0, platform::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations PADDLE_ENFORCE_GT(output_height, 1, platform::errors::InvalidArgument( "The `output_height` should be greater than one, " - "but recieved output_height: %d .", + "but received output_height: %d .", output_height)); PADDLE_ENFORCE_GT(output_width, 1, platform::errors::InvalidArgument( "The `output_width` should be greater than one, " - 
"but recieved output_width: %d .", + "but received output_width: %d .", output_width)); // check output size PADDLE_ENFORCE_GT( dilation_height, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( dilation_width, 0, platform::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); std::vector out_dims; diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cc b/paddle/fluid/operators/fused/conv_fusion_op.cc index e60fc44e9a6ff..671e94061cb5c 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cc +++ b/paddle/fluid/operators/fused/conv_fusion_op.cc @@ -80,7 +80,7 @@ class Conv2DFusionOp : public operators::ConvOp { data_format, "NHWC", platform::errors::PermissionDenied( "Operator(Conv2DFusion) only supports data format of " - "channel first (NCHW) now. But recieved: data_format = '%s'.", + "channel first (NCHW) now. But received: data_format = '%s'.", data_format)); std::vector output_shape = ComputeOutputShape(ctx); @@ -113,7 +113,7 @@ class Conv2DFusionOp : public operators::ConvOp { split_channels_sum, output_shape[1], platform::errors::InvalidArgument( "The sum of Attr(split_channels) is expected to be equal to the " - "total output channels. But recieved: the sum of " + "total output channels. But received: the sum of " "Attr(split_channels) = %d, the total output channels = %d.", split_channels_sum, output_shape[1])); diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 5dbf4fb88b2a7..8191c85f2a120 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -130,7 +130,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { default: PADDLE_THROW(platform::errors::PermissionDenied( "Operator Conv2DFusion expects Input to be a 4-D or 5-D Tensor. " - "But recieved the actual dimension = %d, shape = [%s].", + "But received the actual dimension = %d, shape = [%s].", rank, transformed_input_channel.dims())); } @@ -355,7 +355,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { workspace_size_in_bytes, workspace_size_limit, platform::errors::InvalidArgument( "The actual workspace size to be allocated for cuDNN is expected " - "to be less than the limit. But recieved: the actual workspace " + "to be less than the limit. But received: the actual workspace " "size = %d, limit = %d.", workspace_size_in_bytes, workspace_size_limit)); @@ -414,7 +414,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } else { // TODO(qingiqng): do copy when batch size large than 1 PADDLE_THROW(platform::errors::Unimplemented( - "Input with batch size greater than 1 is unsupported. The recieved " + "Input with batch size greater than 1 is unsupported. 
The received " "batch size is %d, Input's shape is [%s].", x_dims[0], phi::make_ddim(x_dims))); } diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h index 74cc92eb8ab62..4b3ed56890e18 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -40,7 +40,7 @@ struct BNStatsFinalizeArgs { PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index f63fe4b96cbeb..b32f2e40933ac 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -45,13 +45,13 @@ struct NormConvolutionArgs { PADDLE_ENFORCE_EQ( input_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of input_shape is expected to 4. But recieved " + "The size of input_shape is expected to 4. But received " "input_shape's size is %d, input_shape is [%s].", input_shape.size(), phi::make_ddim(input_shape))); PADDLE_ENFORCE_EQ( filter_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of filter_shape is expected to 4. But recieved " + "The size of filter_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", filter_shape.size(), phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && @@ -59,20 +59,20 @@ struct NormConvolutionArgs { true, platform::errors::InvalidArgument( "The filter_shape is expected to store as nhwc, and " - "h = w = 1 or 3. But recieved filter_shape is [%s].", + "h = w = 1 or 3. But received filter_shape is [%s].", phi::make_ddim(filter_shape))); PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), true, platform::errors::InvalidArgument( "The input channel is expected to be multiple of 8, " "and the output channel is expected to be multiple " - "of 32. But recieved input channel is %d, output " + "of 32. But received input channel is %d, output " "channel is %d.", filter_shape[3], filter_shape[0])); PADDLE_ENFORCE_EQ( output_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of output_shape is expected to 4. But recieved " + "The size of output_shape is expected to 4. But received " "filter_shape's size is %d, filter_shape is [%s].", output_shape.size(), phi::make_ddim(output_shape))); is_support = IsSupport(ctx, filter_shape, stride, dilation, group); @@ -83,7 +83,7 @@ struct NormConvolutionArgs { "compatiblity greater than or equal to 70 and the kernel size " "must be equal to 1 or 3. When the kernel size is 1, " "the stride must be 1 if the compatiblity is equal to 70. " - "Besides, the dilation and group must be equal to 1. But recieved " + "Besides, the dilation and group must be equal to 1. 
But received " "compatiblity is %d, kernel size is %d, stride is %d, " "dilation is %d, group is %d", ctx.GetComputeCapability(), filter_shape[1], stride, dilation, diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h index 9d3090a7179f0..c8588b0c02e9d 100644 --- a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -43,19 +43,19 @@ struct ScaleBiasAddReluArgs { PADDLE_ENFORCE_EQ( data_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of data_shape is expected to 4. But recieved " + "The size of data_shape is expected to 4. But received " "data_shape's size is %d, data_shape is [%s].", data_shape.size(), phi::make_ddim(data_shape))); PADDLE_ENFORCE_EQ( param_shape.size(), 4U, platform::errors::InvalidArgument( - "The size of param_shape is expected to 4. But recieved " + "The size of param_shape is expected to 4. But received " "param_shape's size is %d, param_shape is [%s].", param_shape.size(), phi::make_ddim(param_shape))); PADDLE_ENFORCE_EQ( bitmask_shape.size(), 3U, platform::errors::InvalidArgument( - "The size of bitmask_shape is expected to 3. But recieved " + "The size of bitmask_shape is expected to 3. But received " "bitmask_shape's size is %d, bitmask_shape is [%s].", bitmask_shape.size(), phi::make_ddim(bitmask_shape))); diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc index 27dae27751681..1b5b074ef1c71 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cc @@ -76,7 +76,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute x_num_col_dims used to flatten input X to " "a 2-D tensor, is expected to be less than the number of " - "input X's dimensions. But recieved x_num_col_dims is %d, " + "input X's dimensions. But received x_num_col_dims is %d, " "the number of input X's dimensions is %d, input X's shape is %s.", x_num_col_dims, x_dims.size(), x_dims)); @@ -85,7 +85,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { x_mat_dims[1], w_dims[0], platform::errors::InvalidArgument( "The input's second dimension and weight's first dimension is " - "expected to be the same. But recieved input's second dimension is " + "expected to be the same. But received input's second dimension is " "%d, input's shape is %s; weight's first dimension is %d, weight's " "shape is %s.", x_mat_dims[1], x_mat_dims, w_dims[0], w_dims)); @@ -100,7 +100,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(phi::make_ddim(fc_out_dims), y_dims, platform::errors::InvalidArgument( "The output's shape of fc is expected to be equal to " - "that of input Y. But recieved output's shape of fc " + "that of input Y. But received output's shape of fc " "is %s, input Y's shape is %s.", phi::make_ddim(fc_out_dims), y_dims)); @@ -110,7 +110,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The attribute begin_norm_axis used to flatten input Y to a 2-D " "tensor, is expected to be less than the number of input Y's " - "dimensions. But recieved begin_norm_axis is %d, the number of " + "dimensions. 
But received begin_norm_axis is %d, the number of " "input Y's dimensions is %d, input Y's shape is %s.", begin_norm_axis, y_dims.size(), y_dims)); @@ -122,7 +122,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(scale_dims.size(), 1, platform::errors::InvalidArgument( "The input Scale is expected to be an 1-D tensor. " - "But recieved the number of input Scale's " + "But received the number of input Scale's " "dimensions is %d, input Scale's shape is %s.", scale_dims.size(), scale_dims)); @@ -132,7 +132,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Scale is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Scale is %d, input " + "But received the first dimension of input Scale is %d, input " "Scale's shape is %s; the second dimension of flattened input " "Y is %d, input Y's shape is %s, flattened axis is %d.", scale_dims[0], scale_dims, dim_1, y_dims, begin_norm_axis)); @@ -144,7 +144,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { bias1_dims.size(), 1, platform::errors::InvalidArgument( "The input Bias1 is expected to be an 1-D tensor. " - "But recieved the number of input Bias1's dimension is %d, " + "But received the number of input Bias1's dimension is %d, " "input Bias1's shape is %s.", bias1_dims.size(), bias1_dims)); @@ -154,7 +154,7 @@ class FusedFCElementwiseLayerNormOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "The first dimension of input Bias1 is expected to be equal to " "the second dimension of input Y after flattened. " - "But recieved the first dimension of input Bias1 is %d, input " + "But received the first dimension of input Bias1 is %d, input " "Bias1's shape is %s; the second dimension of flatten input " "Y is %d, input Y's shape is %s, flattened axis is %d.", bias1_dims[0], bias1_dims, dim_1, y_dims, begin_norm_axis)); diff --git a/paddle/fluid/operators/fused/fusion_group_op.cc b/paddle/fluid/operators/fused/fusion_group_op.cc index 738e069081511..1ebbdf792df85 100644 --- a/paddle/fluid/operators/fused/fusion_group_op.cc +++ b/paddle/fluid/operators/fused/fusion_group_op.cc @@ -52,7 +52,7 @@ class FusionGroupOp : public framework::OperatorWithKernel { x_dims[0], x_dims[i], platform::errors::InvalidArgument( "All the inputs' dims is expected to be the same. " - "But recieved [%s] (name: %s) vs [%s] (name: %s).", + "But received [%s] (name: %s) vs [%s] (name: %s).", x_dims[0], input_names[0], x_dims[i], input_names[i])); } std::vector out_dims; diff --git a/paddle/fluid/operators/inverse_op.cc b/paddle/fluid/operators/inverse_op.cc index 8c1fd34ae87d2..f5b817a0e11fa 100644 --- a/paddle/fluid/operators/inverse_op.cc +++ b/paddle/fluid/operators/inverse_op.cc @@ -33,21 +33,21 @@ class InverseOp : public framework::OperatorWithKernel { input_rank, 2, platform::errors::InvalidArgument( "The dimension of Input(Input) is expected to be no less than 2. " - "But recieved: Input(Input)'s dimension = %d, shape = [%s].", + "But received: Input(Input)'s dimension = %d, shape = [%s].", input_rank, input_dims)); for (int64_t i = 0; i < input_rank; ++i) { PADDLE_ENFORCE_EQ( (input_dims[i] == -1) || (input_dims[i] > 0), true, platform::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. 
Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", input_dims[i], input_dims)); } if (input_dims[input_rank - 2] > 0 && input_dims[input_rank - 1] > 0) { PADDLE_ENFORCE_EQ(input_dims[input_rank - 2], input_dims[input_rank - 1], platform::errors::InvalidArgument( "The last two dimensions are expected to be equal. " - "But recieved: %d and %d; " + "But received: %d and %d; " "Input(Input)'s shape = [%s].", input_dims[input_rank - 2], input_dims[input_rank - 1], input_dims)); diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index 86327a4f2c13a..642c8bcd9ae49 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -77,7 +77,7 @@ class LoDResetKernel : public framework::OpKernel { platform::errors::InvalidArgument( "The last value of 'Target LoD''s last level LoD should be equal " "to the first dimension of Input(X). But received the 'Target LoD' " - "is %s, Input(X)'s shape is is %s.", + "is %s, Input(X)'s shape is %s.", phi::make_ddim(level0), in->dims())); for (size_t i = 0; i < level0.size() - 1; ++i) { PADDLE_ENFORCE_GE(level0[i + 1], level0[i], diff --git a/paddle/fluid/operators/math/cross_entropy.h b/paddle/fluid/operators/math/cross_entropy.h index e339be06d69ed..da7340e4eb0b3 100644 --- a/paddle/fluid/operators/math/cross_entropy.h +++ b/paddle/fluid/operators/math/cross_entropy.h @@ -38,7 +38,7 @@ struct TolerableValue { // NOTE(dzh): float16 value clip behave different. // 1. Our ValueClipping has a hardcore threshold 1e20 // for float number. 1e20 will resulting in overflow in float16. -// 2. float16 should expose the the real number overflow to python. +// 2. float16 should expose the real number overflow to python. // because mixed-training depends the inf/nan value to determine // if the scale value will be adjusted. // Also. In standard implementation of cross entropy, other diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f77287826ffb3..a880afb0e9be3 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -32,7 +32,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -56,27 +56,27 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." 
- "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* out_data = out_value->data(); auto* in1_data = in1_value.data(); @@ -103,14 +103,14 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -121,13 +121,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); phi::funcs::SetConstant functor; @@ -161,7 +161,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -178,11 +178,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_cpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the CPU place.")); + "The running environment is not on the CPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -211,7 +211,7 @@ struct SelectedRowsSumTo { PADDLE_ENFORCE_EQ(in1_height, input2->height(), platform::errors::InvalidArgument( "The two inputs height must be equal." 
- "But recieved first input height = [%d], second " + "But received first input height = [%d], second " "input height = [%d]", in1_height, input2->height())); } @@ -253,7 +253,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -265,7 +265,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -293,7 +293,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -305,7 +305,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -842,7 +842,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -854,7 +854,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 542d4c9784352..db5c66d319701 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -33,7 +33,7 @@ struct SelectedRowsAdd { PADDLE_ENFORCE_EQ( in1_height, input2.height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2.height())); output->set_height(in1_height); @@ -57,13 +57,13 @@ struct SelectedRowsAdd { in1_row_numel, in2_value.numel() / in2_rows.size(), platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, in2_value.numel() / in2_rows.size())); PADDLE_ENFORCE_EQ( in1_row_numel, out_value->numel() / out_rows.size(), platform::errors::InvalidArgument( "The input and oupput width must be equal." 
- "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, out_value->numel() / out_rows.size())); auto* out_data = out_value->data(); @@ -72,15 +72,15 @@ struct SelectedRowsAdd { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in2_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto out_place = context.GetPlace(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(out_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); memory::Copy(out_place, out_data, in1_place, in1_data, in1_value.numel() * sizeof(T), context.stream()); @@ -126,13 +126,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -143,13 +143,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." - "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -186,13 +186,13 @@ struct SelectedRowsAddTensor { in1_height, in2_dims[0], platform::errors::InvalidArgument( "The two inputs height must be equal." - "But recieved first input height = [%d], first input height = [%d]", + "But received first input height = [%d], first input height = [%d]", in1_height, in2_dims[0])); PADDLE_ENFORCE_EQ( in1_height, out_dims[0], platform::errors::InvalidArgument( "The input and output height must be equal." - "But recieved input height = [%d], output height = [%d]", + "But received input height = [%d], output height = [%d]", in1_height, out_dims[0])); auto& in1_value = input1.value(); @@ -203,13 +203,13 @@ struct SelectedRowsAddTensor { in1_row_numel, input2.numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2.numel() / in1_height)); PADDLE_ENFORCE_EQ( in1_row_numel, output->numel() / in1_height, platform::errors::InvalidArgument( "The input and output width must be equal." 
- "But recieved input width = [%d], output width = [%d]", + "But received input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -254,7 +254,7 @@ struct SelectedRowsAddTo { PADDLE_ENFORCE_EQ( in1_height, input2->height(), platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, input2->height())); @@ -273,11 +273,11 @@ struct SelectedRowsAddTo { auto in1_place = input1.place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto in2_place = input2->place(); PADDLE_ENFORCE_EQ(platform::is_gpu_place(in1_place), true, platform::errors::InvalidArgument( - "The running enviroment is not on the GPU place.")); + "The running environment is not on the GPU place.")); auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); @@ -322,7 +322,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -334,7 +334,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -359,7 +359,7 @@ struct SelectedRowsAddToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -371,7 +371,7 @@ struct SelectedRowsAddToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." - "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.data(); @@ -675,7 +675,7 @@ struct UpdateToTensor { PADDLE_ENFORCE_EQ( in1_height, in2_dims[0], platform::errors::InvalidArgument("The two inputs height must be equal." - "But recieved first input height = " + "But received first input height = " "[%d], second input height = [%d]", in1_height, in2_dims[0])); @@ -687,7 +687,7 @@ struct UpdateToTensor { in1_row_numel, input2->numel() / in1_height, platform::errors::InvalidArgument( "The two inputs width must be equal." 
- "But recieved first input width = [%d], second input width = [%d]", + "But received first input width = [%d], second input width = [%d]", in1_row_numel, input2->numel() / in1_height)); auto* in1_data = in1_value.template data(); diff --git a/paddle/fluid/operators/metrics/accuracy_op.cc b/paddle/fluid/operators/metrics/accuracy_op.cc index 32ef052119883..ed58c90e17022 100644 --- a/paddle/fluid/operators/metrics/accuracy_op.cc +++ b/paddle/fluid/operators/metrics/accuracy_op.cc @@ -36,7 +36,7 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { // TODO(typhoonzero): support both inference value and indices. AddInput("Out", "The network output of topk (inferences)"); - AddInput("Indices", "The the network output of topk (indices)"); + AddInput("Indices", "The network output of topk (indices)"); AddInput("Label", "Label of the training data"); // TODO(typhoonzero): AddInput("Weight", ... AddOutput("Accuracy", "The accuracy of current batch"); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 6b801924446ca..867c5f212ba6c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -698,14 +698,14 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() { const bool exclusive, const bool reverse, const cnnlTensorDescriptor_t input_desc, const void* input, - const cnnlTensorDescriptor_t ouput_desc, + const cnnlTensorDescriptor_t output_desc, void* output) { cnnlHandle_t handle = GetHandleFromCTX(ctx); // NAN propagation mode: Only support CNNL_NOT_PROPAGATE_NAN now. cnnlNanPropagation_t mode = CNNL_NOT_PROPAGATE_NAN; PADDLE_ENFORCE_MLU_SUCCESS(cnnlCumsum(handle, input_desc, input, axis, - exclusive, reverse, mode, ouput_desc, + exclusive, reverse, mode, output_desc, output)); } diff --git a/paddle/fluid/operators/positive_negative_pair_op.cc b/paddle/fluid/operators/positive_negative_pair_op.cc index a9646b2e8acb5..cbe58644f5381 100644 --- a/paddle/fluid/operators/positive_negative_pair_op.cc +++ b/paddle/fluid/operators/positive_negative_pair_op.cc @@ -123,7 +123,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { column, depth, platform::errors::OutOfRange( "Attr(column) should be less than depth(the second " - "dimension of Input(Score)). Recieved Attr(column): %d, while " + "dimension of Input(Score)). Received Attr(column): %d, while " "depth is %d.", column, depth)); PADDLE_ENFORCE_GE( @@ -131,7 +131,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { platform::errors::OutOfRange( "Attr(column) should be greater than equal to negative " "depth, i.e. the second dimension of Input(Score). " - "Recieved Attr(column): %d, while negative depth is %d.", + "Received Attr(column): %d, while negative depth is %d.", column, -depth)); } diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 7228bdbf3805a..6a2ed6592e7fe 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -98,7 +98,7 @@ static void VisitDataType(paddle::experimental::DataType type, visitor.template apply(); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The recieved values gate_id type %s can not meet input requirements. " + "The received values gate_id type %s can not meet input requirements. " "Because the given gate_id data type of operators must be " "int64. Please input appropriate gate_id again! 
", "framework::DataTypeToString(type)")); diff --git a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc index 2df0d7526a3d3..457e37744d316 100644 --- a/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc +++ b/paddle/fluid/operators/pscore/heter_listen_and_serv_op.cc @@ -63,7 +63,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_EQ(pieces.size(), 2, platform::errors::PreconditionNotMet( "Invalid format of message_and_id argument. " - "Expected \"message:block_id\". Recieved %s", + "Expected \"message:block_id\". Received %s", grad_and_id.c_str())); PADDLE_ENFORCE_EQ(out_map->count(pieces[0]), 0, platform::errors::AlreadyExists( @@ -82,7 +82,7 @@ void HeterListenAndServOp::RunAsyncLoop(framework::ProgramDesc *program) const { PADDLE_ENFORCE_GE(num_blocks, 1, platform::errors::PreconditionNotMet( "Invalid number of blocks in server program. Expected " - "equal or greater than 1. Recieved %zu", + "equal or greater than 1. Received %zu", num_blocks)); std::vector block_list; for (size_t blkid = 1; blkid < num_blocks; ++blkid) { diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc index 220d91bf4faab..941e463f63cdc 100644 --- a/paddle/fluid/operators/rnn_op_xpu.cc +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -65,7 +65,7 @@ class RnnXPUKernel : public framework::OpKernel { auto* output = ctx.Output("Out"); auto* dropout_mask = ctx.Output("DropoutState"); auto* reserve_data = ctx.Output("Reserve"); - // Attrbutes + // Attributes const int& num_layers = ctx.Attr("num_layers"); const bool& is_bidirec = ctx.Attr("is_bidirec"); const int& hidden_size = ctx.Attr("hidden_size"); diff --git a/paddle/fluid/operators/sample_logits_op.cc b/paddle/fluid/operators/sample_logits_op.cc index 420c4c5f257ca..e02c7ade9a11a 100644 --- a/paddle/fluid/operators/sample_logits_op.cc +++ b/paddle/fluid/operators/sample_logits_op.cc @@ -58,7 +58,7 @@ class SampleLogitsOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput( "Probabilities", "(Tensor, default: Tensor), A 2-D tensor with shape [N, NT + S]." - "The probabilites of sampled positive and negtive labels.") + "The probabilities of sampled positive and negtive labels.") .AsIntermediate(); AddOutput("LogitsDim", "Store dim information of Logits for gradient op") .AsIntermediate(); diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index f186f95a2b961..ed173bb3ebfa9 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -22,7 +22,7 @@ using Tensor = framework::Tensor; const int kIgnoreIndex = -100; void CheckAttrs(const framework::ExecutionContext& ctx) { - // Add this check is is due to Ascend SigmoidCrossEntropyWithLogits + // Add this check is due to Ascend SigmoidCrossEntropyWithLogits // and SigmoidCrossEntropyWithLogitsGrad does't supoort // attr normalize and ignore_index bool normalize = ctx.Attr("normalize"); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 33590c1d7cca0..8c6c083cde880 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -156,7 +156,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { } } - // compute select rows seperately. + // compute select rows separately. 
if (!selectrow_index.empty()) { std::vector sr_in_out_data; size_t rows = 0; @@ -241,7 +241,7 @@ class SumKernel LodTensorArrayCompute(context); } else { PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Ouput(out) must be Tensor, SelectedRows or " + "Expected type of Output(out) must be Tensor, SelectedRows or " "LodTensorArray. But got " "unsupport type: %s.", framework::ToTypeName(out_var->Type()))); diff --git a/paddle/fluid/operators/tdm_child_op.h b/paddle/fluid/operators/tdm_child_op.h index 963dfd3bf7720..e437975320cc5 100644 --- a/paddle/fluid/operators/tdm_child_op.h +++ b/paddle/fluid/operators/tdm_child_op.h @@ -149,7 +149,7 @@ class TDMChildKernel : public framework::OpKernel { output_type == framework::proto::VarType::INT64; PADDLE_ENFORCE_EQ(out_type_match, true, platform::errors::InvalidArgument( - "Ouput(Child) & Output(LeafMask) holds the wrong " + "Output(Child) & Output(LeafMask) holds the wrong " "type, it holds %s, but " "desires to be %s or %s", paddle::framework::DataTypeToString(output_type), diff --git a/paddle/fluid/operators/warpctc_op.cc b/paddle/fluid/operators/warpctc_op.cc index 5cd9feee82895..1583e5d84b233 100644 --- a/paddle/fluid/operators/warpctc_op.cc +++ b/paddle/fluid/operators/warpctc_op.cc @@ -95,7 +95,7 @@ An operator integrating the open-source https://arxiv.org/pdf/1512.02595v1.pdf), to compute Connectionist Temporal Classification (CTC) loss. It can be aliased as softmax with ctc, since a native softmax activation is -interated to the warp-ctc library, to to normalize values for each row of the +integrated to the warp-ctc library, to normalize values for each row of the input tensor. More detail of CTC loss can be found by referring to diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 356b5ab2cd23c..24d39c25cf335 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -116,7 +116,7 @@ endif() cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost) -# seperate init from device_context to avoid cycle dependencies +# separate init from device_context to avoid cycle dependencies cc_library(init SRCS init.cc DEPS device_context custom_kernel context_pool) # memcpy depends on device_context, here add deps individually for diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.h b/paddle/fluid/platform/device/npu/npu_op_runner.h index 2409c14b760fd..739a3ef41e422 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.h +++ b/paddle/fluid/platform/device/npu/npu_op_runner.h @@ -70,7 +70,7 @@ class NpuOpRunner { NpuOpRunner &AddInput(const Tensor &tensor); // NOTE(zhiqiu): CANN-5.0.2 support input tensors on host. - // Specifically, the tensor of shape, tensor of dims, etc, which are are small + // Specifically, the tensor of shape, tensor of dims, etc, which are small // vector/list.
NpuOpRunner &AddInput(const Tensor &tensor, aclMemType mem_type); diff --git a/paddle/fluid/platform/profiler/event_node.cc b/paddle/fluid/platform/profiler/event_node.cc index 6c8be1811d715..b909fb5f25aa7 100644 --- a/paddle/fluid/platform/profiler/event_node.cc +++ b/paddle/fluid/platform/profiler/event_node.cc @@ -51,7 +51,7 @@ void NodeTrees::BuildTrees( const std::vector& host_event_nodes, std::vector& runtime_event_nodes, const std::vector& device_event_nodes) { - // seperate Host Event Nodes into different threads + // separate Host Event Nodes into different threads std::map> thread2host_event_nodes; // used to store HostTraceEventNodes per thread std::map> diff --git a/paddle/infrt/common/object.h b/paddle/infrt/common/object.h index ab2d00cce985c..797595cc7c58b 100644 --- a/paddle/infrt/common/object.h +++ b/paddle/infrt/common/object.h @@ -25,7 +25,7 @@ template class Shared; /** * Object is the basic element in the INFRT, with `Shared` wrapper, the object - * can be shared accross the system. + * can be shared across the system. */ struct Object { //! Get the type representation of this object. diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2139605fb2048..837a43905e723 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -1534,7 +1534,7 @@ void MvInferMeta(const MetaTensor& x, const MetaTensor& vec, MetaTensor* out) { phi::errors::InvalidArgument( "X's second dimension is expected to be equal to " "Vec's first dimension" - "but recieved X'shape = [%s], Vec's shape = [%s]", + "but received X'shape = [%s], Vec's shape = [%s]", dim_x, dim_vec)); diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index e793eb8e66872..48c40673ab819 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -458,7 +458,7 @@ void BatchNormInferMeta(const MetaTensor& x, true, phi::errors::InvalidArgument( "Each dimension of input tensor is expected to be -1 or a " - "positive number, but recieved %d. Input's shape is [%s].", + "positive number, but received %d. Input's shape is [%s].", x_dims[i], x_dims)); } @@ -755,7 +755,7 @@ inline int ConvOutputSize( 0, phi::errors::InvalidArgument( "The output's size is expected to be greater than 0. " - "But recieved: output's size is %d. The output's size is computed by " + "But received: output's size is %d. The output's size is computed by " "((input_size + 2 * padding - (dilation * (filter_size - 1) + 1)) / " "stride + 1), where input_size is %d, padding is %d, " "filter_size is %d, dilation is %d, stride is %d.", diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 6d37a31f54562..6c2956417a3a3 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2746,7 +2746,7 @@ void UnfoldInferMeta(const MetaTensor& x, phi::errors::InvalidArgument( "The dims of X should be larger than that of kernel_sizes " "by a number of 2, due to the batch size and input channel dim. " - "But recieved dims(X:%u) - dims(kernel_sizes:%u) != 2", + "But received dims(X:%u) - dims(kernel_sizes:%u) != 2", in_dims.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2754,7 +2754,7 @@ void UnfoldInferMeta(const MetaTensor& x, kernel_sizes.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of kernel_sizes. 
" - "But recieved dims(strides: %u) != dims(kernel_sizes: %u).", + "But received dims(strides: %u) != dims(kernel_sizes: %u).", strides.size(), kernel_sizes.size())); PADDLE_ENFORCE_EQ( @@ -2762,7 +2762,7 @@ void UnfoldInferMeta(const MetaTensor& x, 2 * strides.size(), phi::errors::InvalidArgument( "The dims of paddings should be 2 times of that of strides. " - "But recieved dims(paddings: %u) != 2*dims(strides: %u).", + "But received dims(paddings: %u) != 2*dims(strides: %u).", paddings.size(), strides.size())); PADDLE_ENFORCE_EQ( @@ -2770,7 +2770,7 @@ void UnfoldInferMeta(const MetaTensor& x, dilations.size(), phi::errors::InvalidArgument( "The dims of strides should be the same with that of dilations. " - "But recieved dims(strides: %u) != dims(dilations: %u).", + "But received dims(strides: %u) != dims(dilations: %u).", strides.size(), dilations.size())); @@ -2779,14 +2779,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); PADDLE_ENFORCE_GT(kernel_sizes[1], 0, phi::errors::InvalidArgument( "The `kernel_sizes` should be greater than zero, " - "but recieved kernel_height: %d kernel_width: %d.", + "but received kernel_height: %d kernel_width: %d.", kernel_sizes[0], kernel_sizes[1])); // check strides @@ -2794,14 +2794,14 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); PADDLE_ENFORCE_GT(strides[1], 0, phi::errors::InvalidArgument( "The `strides` should be greater than zero, " - "but recieved strides_height: %d strides_width: %d.", + "but received strides_height: %d strides_width: %d.", strides[0], strides[1])); // check dilations @@ -2810,7 +2810,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); PADDLE_ENFORCE_GT( @@ -2818,7 +2818,7 @@ void UnfoldInferMeta(const MetaTensor& x, 0, phi::errors::InvalidArgument( "The `dilations` should be greater than zero, " - "but recieved dilations_height: %d dilations_width: %d.", + "but received dilations_height: %d dilations_width: %d.", dilations[0], dilations[1])); diff --git a/paddle/phi/kernels/cpu/conv_util.h b/paddle/phi/kernels/cpu/conv_util.h index d26d89086b27e..159a5cfbeb6b4 100644 --- a/paddle/phi/kernels/cpu/conv_util.h +++ b/paddle/phi/kernels/cpu/conv_util.h @@ -38,7 +38,7 @@ inline void UpdatePaddingAndDilation(std::vector* paddings, phi::errors::InvalidArgument( "Attribute padding's size should be the same or twice as the " "input's dimension. 
" - "But recieved: padding's size is %d, padding is [%s]; input's " + "But received: padding's size is %d, padding is [%s]; input's " "dimension is %d, input's shape is [%s].", paddings->size(), make_ddim(*paddings), diff --git a/paddle/phi/kernels/cpu/rnn_kernel.cc b/paddle/phi/kernels/cpu/rnn_kernel.cc index cae97eb076453..ae2c7a72635f7 100644 --- a/paddle/phi/kernels/cpu/rnn_kernel.cc +++ b/paddle/phi/kernels/cpu/rnn_kernel.cc @@ -808,7 +808,7 @@ struct BidirLayer : public Layer { mode, is_test); - // concat the the output result + // concat the output result funcs::ConcatFunctor concat_functor; concat_functor(dev_ctx, output_vec, static_cast(2), output); } diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 38cd41d3b6130..17735c05ada52 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -53,7 +53,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, axis + 1, out_dims[axis], @@ -70,7 +70,7 @@ struct DimensionsTransform { PADDLE_THROW(phi::errors::InvalidArgument( "The %d-th dimension of input tensor is expected to be equal " "with the %d-th dimension of output tensor %d or 1, but " - "recieved %d.", + "received %d.", in_idx + 1, in_idx + 1, out_dims[in_idx], @@ -552,7 +552,7 @@ void BroadcastKernelForDifferentDimSize( default: { PADDLE_THROW(phi::errors::InvalidArgument( "The maximum dimension of input tensor is expected to be less than " - "%d, but recieved %d.", + "%d, but received %d.", merge_dims.dim_size, phi::DDim::kMaxRank)); } @@ -578,7 +578,7 @@ void BroadcastKernelForDifferentVecSize( kArity, phi::errors::InvalidArgument( "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " + "arity of functor. But received: the number of inputs " "is %d, the arity of functor is %d.", ins.size(), kArity)); diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 4ee46facc7913..1093bdfa726c8 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -849,7 +849,7 @@ void ElementwiseKernel(const KPDevice &ctx, kArity, phi::errors::InvalidArgument( "The number of inputs is expected to be equal to the " - "arity of functor. But recieved: the number of inputs " + "arity of functor. 
But received: the number of inputs " "is %d, the arity of functor is %d.", ins.size(), kArity)); diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu index adbcd3b2b6207..b190bce474280 100644 --- a/paddle/phi/kernels/gpu/bce_loss_kernel.cu +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -38,7 +38,7 @@ struct BCELossFunctor { HOSTDEVICE inline T operator()(const T x, const T label) const { PADDLE_ENFORCE( (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", + "Input is expected to be within the interval [0, 1], but received %f.", x); T term1 = max(phi::kps::details::Log(x), neg_100); T term2 = max(phi::kps::details::Log(one - x), neg_100); diff --git a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h index 82bd9fba2a66d..e3cd6f5828d04 100644 --- a/paddle/phi/kernels/impl/searchsorted_kernel_impl.h +++ b/paddle/phi/kernels/impl/searchsorted_kernel_impl.h @@ -158,7 +158,7 @@ static void VisitDataType(DataType type, Visitor visitor) { visitor.template apply(); } else { PADDLE_THROW(errors::InvalidArgument( - "The recieved values data type %s can not meet input requirements. " + "The received values data type %s can not meet input requirements. " "Because the given values data type of searchsorted operators must be " "float32, float64, int32 or int64. Please input appropriate " "sorted_sequence again! ", diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index 95deb007d99d9..8061a1fbd610a 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -30,7 +30,7 @@ namespace phi { * The infrt declare like: * * def PDKEL_Reshape_to_CPU : Pat< - * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguements + * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), // OpMaker arguments * (PDKEL_ReshapeKernelAttr $x, fn($shape_attr)>; // Kernel arguments * def PDKEL_Reshape_to_CPU : Pat< * (PD_ReshapeOp $x, $shape_tensor, $shape_attr), diff --git a/python/paddle/README.rst b/python/paddle/README.rst index e779f1264c451..2d48ee4b26caf 100644 --- a/python/paddle/README.rst +++ b/python/paddle/README.rst @@ -88,7 +88,7 @@ If you want to install paddlepaddle-gpu with cuda version of 9.0 ,10.0 ,10.1 ,or After the installation is complete, you can use `python` or `python3` to enter the Python interpreter and then use `import paddle.fluid` and `fluid.install_check.run_check()` -If `Your Paddle Fluid is installed succesfully!` appears, to verify that the installation was successful. +If `Your Paddle Fluid is installed successfully!` appears, to verify that the installation was successful. diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 471448b031dde..d1b6e57ddc123 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -156,6 +156,6 @@ def __str__(self): # Note that Process group 0 is reserved for representing all ranks. -# At the begining, group 0 is empty and new ranks will be added automatically. +# At the beginning, group 0 is empty and new ranks will be added automatically. 
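# Illustrative lookup against the registry initialized below; only the dict
# itself is assumed:
#   group = _g_process_group_map[0]  # the reserved all-ranks group, initially empty
# so a lookup of id 0 is always valid once this module has been imported.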
_g_process_group_map = {} _g_process_group_map[0] = ProcessGroup(0, []) diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 9d20e432d8961..986d8e401e872 100644 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -1168,9 +1168,9 @@ def sharding_configs(self): dp_degree(int, optional): specific the number of data parallelism group; when dp_degree >= 2, it will introduce dp_degree ways data parallelism as the outer parallelsim for the inner parallelsim. User is responsible to ensure global_world_size = mp_degree * sharding_degree * pp_degree * dp_degree. Default is 1. - mp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. + mp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each megatron parallelism group; and megatron parallelism will turn be off if mp_degree=1. Default is 1. - pp_degree(int, optional): [Hybrid parallelism ONLY] specific the the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. + pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. @@ -1485,7 +1485,7 @@ def localsgd_configs(self): **Notes**: k_steps(int) The local steps for training before parameter synchronization. Default 1. - begin_step(int) The step of begining training by localsgd. Default 1. + begin_step(int) The step of beginning training by localsgd. Default 1. Examples: @@ -1544,7 +1544,7 @@ def adaptive_localsgd_configs(self): init_k_steps(int) The initial steps for training before adaptive localsgd. Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. - begin_step(int) The step of begining training by adaptive localsgd. Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index c5a9df50589cc..343cca7f4f0d3 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -556,7 +556,7 @@ def launch(): - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. 
e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py index 8f1a4de86de0d..3a52041dc7e2c 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_parser.py @@ -1372,7 +1372,7 @@ def _apply(self): max_v = self.op.attr("max") seed = self.op.attr("seed") dtype = self.op.attr("dtype") - assert max_v > min_v, "assert max_v > min_v, but recieved " + \ + assert max_v > min_v, "assert max_v > min_v, but received " + \ "as max_v={}, min_v={} ".format(max_v, min_v) tensor1 = self._create_ge_tensor([len(shape)], 2, shape) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index b7edf5830025d..d487f35324df9 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -170,7 +170,7 @@ def minimize(self, result = self._inner_optimizer.minimize(loss, startup_program, parameters, no_grad_set) - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() return result @@ -181,7 +181,7 @@ def step(self): # actually updating self._inner_optimizer.step() - # sync parameters accross sharding ranks + # sync parameters across sharding ranks self._sharding_sync_parameters() # TODO is it a good way to make _grad_clip a property diff --git a/python/paddle/distributed/launch/main.py b/python/paddle/distributed/launch/main.py index 400a447260252..b2c87e737c82d 100644 --- a/python/paddle/distributed/launch/main.py +++ b/python/paddle/distributed/launch/main.py @@ -54,7 +54,7 @@ def launch(): - ``--devices``: The selected accelerate devices on nodes, can be gpu/xpu/npu/mlu etc.. e.g., ``--devices=0,1,2,3`` will launch four training processes each bound to one device. - - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``training.py`` - ``training_script_args``: The args of training_script. 
e.g., ``--lr=0.1`` diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index 69c3eef7e3771..9dda310e5c022 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -306,7 +306,7 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, in_var_dist_attr = consume_op_attr.get_input_dist_attr( in_var.name) assert in_var_dist_attr is not None - # truely insert cast op + # truly insert cast op if cast_var is None or cast_var.dtype != dst_dtype: # NOTE we make the cast op and var's dist attr as the op that consume the # cast var instead of the op which generates the var diff --git a/python/paddle/distributed/ps/utils/public.py b/python/paddle/distributed/ps/utils/public.py index e7edc6fd859a6..7acfd6cfe19f5 100755 --- a/python/paddle/distributed/ps/utils/public.py +++ b/python/paddle/distributed/ps/utils/public.py @@ -748,7 +748,7 @@ def _append_heter_op(op, current_heter_block_ops, heter_ops): def union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to eliminate the unnecessary variable transmit """ """ diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 629529ff1b965..56d77f77b5083 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -405,7 +405,7 @@ def test_post_training_abs_max_mobilenetv1(self): is_full_quantize = False is_use_cache_file = False is_optimize_model = False - # The accuracy diff of post-traing quantization (abs_max) maybe bigger + # The accuracy diff of post-training quantization (abs_max) maybe bigger diff_threshold = 0.05 self.run_test(model, algo, round_type, data_urls, data_md5s, quantizable_op_type, is_full_quantize, is_use_cache_file, diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index 1ae57bcb30310..ba5c709b1d877 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ b/python/paddle/fluid/dygraph/checkpoint.py @@ -257,7 +257,7 @@ def load_dygraph(model_path, **configs): para_dict = structured_para_dict else: # load state dict by `io.save_params/persistables` save format - # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # TODO(chenweihang): [ Now only supports loading parameters separately ] # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in dygraph mode is difficult, we recommend users to use diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 3a7b012b02bee..576baf6cc299a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -167,7 +167,7 @@ def convert_logical_not(x): A function representation of a Python ``not`` statement. Args: - x(bool|Tensor): Operand of of ``not`` operator.
+ x(bool|Tensor): Operand of ``not`` operator. Returns: A python bool variable or a bool Tensor. diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index c6ff3a583d6a3..164545d0a0595 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -873,7 +873,7 @@ def _get_targets(_optimize_ops, _fetch_list, item): _fetch_list.append(item) else: raise TypeError( - "The item in fetch_list should be str, variable or optimize_op, but recieved %s.", + "The item in fetch_list should be str, variable or optimize_op, but received %s.", type(item)) for index, item in enumerate(fetch_list): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 46f26e8e52cd5..2c09abac9e7ba 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -1407,7 +1407,7 @@ def get_communicate_var_info(program, def union_forward_gradient_op(program_block_ops_list): """ before analyzing the input & output of each block in program_block_list, we should - union the forward op and corresponding gradient op to elimincate the uneccessary variable + union the forward op and corresponding gradient op to eliminate the unnecessary variable transmit """ """ diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index ce6fe6918b56b..47f0c02d28725 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -234,7 +234,7 @@ def __weight_normalize(g, v, dim): x=g, y=norm) # The shapes of g and norm are the same. # Currently, elementwise_mul only support broadcast when the shape # of y is a subset of the shape of x. Thus, we reshape y to squeeze - # to achive the subset. + # to achieve the subset. w = elementwise_mul( x=v, y=scale if dim is None else reshape( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 200e8feec1e6a..8be719758ef98 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -13744,7 +13744,7 @@ def get_tensor_from_selected_rows(x, name=None): x.height = 20 x.value = [[1, 1] [2, 2] [2, 2] [3, 3] [6, 6]] - Ouput is LoDTensor: + Output is LoDTensor: out.shape = [5, 2] out.data = [[1, 1], [2, 2], diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 707a1dc2cbc2f..b04cf90e1d8f9 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -673,7 +673,7 @@ def birnn(cell_fw, birnn creates a bidirectional recurrent neural network specified by RNNCell `cell_fw` and `cell_bw`, which performs :code:`cell.call()` (for dygraph mode :code:`cell.forward`) repeatedly until reaches to - the maximum length of `inputs` and then concat the ouputs for both RNNs + the maximum length of `inputs` and then concat the outputs for both RNNs along the last axis.
Arguments: diff --git a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py index bb8c6e73fdefa..161c785ef8565 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/auto_scan_test.py @@ -388,7 +388,7 @@ def run_test(prog_config): used_time = time.time() - start_time if max_duration > 0 and used_time > max_duration: logging.error( - "The duration exceeds {} seconds, if this is neccessary, try to set a larger number for parameter `max_duration`.". + "The duration exceeds {} seconds, if this is necessary, try to set a larger number for parameter `max_duration`.". format(max_duration)) assert False diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py index a864e2fe5a1c8..1781eb5048347 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_shuffle_channel_detect_pass.py @@ -62,13 +62,13 @@ def generate_reshape2_Input(): "transpose2", inputs={"X": ["reshape2_output1"], }, outputs={ - "Out": ["transpose2_ouput"], + "Out": ["transpose2_output"], "XShape": ["transpose2_xshape"] }, axis=axis_v) reshape2_op2 = OpConfig( "reshape2", - inputs={"X": ["transpose2_ouput"], }, + inputs={"X": ["transpose2_output"], }, outputs={ "Out": ["reshape2_output2"], "XShape": ["reshape2_xshape2"] diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py index aae1cc65c9220..e92821387aed4 100644 --- a/python/paddle/fluid/tests/unittests/ir/pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py @@ -167,7 +167,7 @@ def check_output_with_place(self, place, startup_on_cpu=False, atol=1e-5): def _check_fused_ops(self, program): ''' - Check the number of specified fused op is equal to the the expected + Check the number of specified fused op is equal to the expected number. 
''' if self.fused_op_type is None or self.num_fused_ops < 0: diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 13c72bedefa8e..f7a3dfa1102b2 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -872,7 +872,7 @@ def cal_python_api(python_api, args, kernel_sig): eager_tensor_outputs = egr_oups if egr_oups else self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -906,7 +906,7 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: @@ -2016,7 +2016,7 @@ def _get_dygraph_grad(self, outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) - # prepare attrbutes + # prepare attributes attrs_outputs = {} if hasattr(self, "attrs"): for attrs_name in self.attrs: diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py index cc40b89b585cb..bf33adcf48655 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py +++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py @@ -25,7 +25,7 @@ def _compare_result_with_origin_model(self, check_func, use_device, delta2=1e-5, - compare_seperately=True): + compare_separately=True): if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return @@ -45,7 +45,7 @@ def _compare_result_with_origin_model(self, batch_size=seresnext_net.batch_size(use_device), use_device=use_device) - if compare_seperately: + if compare_separately: for loss in zip(func_1_first_loss, func_2_first_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) for loss in zip(func_1_last_loss, func_2_last_loss): diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index b57f26776234e..4dc3fe6eab6be 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -69,9 +69,9 @@ def __init__(self, super(ParallelFusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py index 5f467da6a6465..ad570fc0acfb3 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_feedforward.py @@ -172,10 +172,10 @@ def __init__(self, name=None): super(ParallelFusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater 
than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py index 4655b628dab4d..f382d61c63743 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker.py @@ -52,7 +52,7 @@ def test_ps_rolemaker(self): self.assertTrue(ro.is_server()) self.assertEqual(ro.worker_num(), 2) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" ro = role_maker.PaddleCloudRoleMaker(is_collective=False) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py index 5e8be9a852273..86ee0db30ef8c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_rolemaker_new.py @@ -116,7 +116,7 @@ def test_ps_rolemaker(self): self.assertEqual(ro._all_gather(1, "worker"), 1) self.assertEqual(ro._all_reduce(1, "sum", "worker"), 1) - def test_traing_role(self): + def test_training_role(self): """Test training role.""" os.environ["TRAINING_ROLE"] = "TEST" diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 7984ca5571658..20a55af15c441 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -162,7 +162,7 @@ def check_clip_result(self, out, out_clip): "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}". 
format(u, v, u - v)) - # test whether the ouput is right when use 'set_gradient_clip' + # test whether the output is right when use 'set_gradient_clip' def test_old_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -172,7 +172,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_new_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -181,7 +181,7 @@ def func(params_grads): self.clip_gradient = func self.check_gradient_clip(fluid.CPUPlace()) - # test whether the ouput is right when use grad_clip under float64 + # test whether the output is right when use grad_clip under float64 def test_new_gradient_clip_fp64(self): def func(params_grads): clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=self.clip_norm) @@ -267,7 +267,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-5, atol=1e-8), "gradient clip by norm has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByNorm(clip_norm=self.clip_norm) @@ -311,7 +311,7 @@ def check_clip_result(self, out, out_clip): a=u, b=v, rtol=1e-6, atol=1e-8), "gradient clip by value has wrong results!") - # test whether the ouput is right when use grad_clip + # test whether the output is right when use grad_clip def test_gradient_clip(self): def func(params_grads): clip = fluid.clip.GradientClipByValue(max=self.max, min=self.min) @@ -397,7 +397,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -426,7 +426,7 @@ def check_clip_result(self, loss, optimizer): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -517,7 +517,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-3, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) @@ -563,7 +563,7 @@ def test_gradient_clip(self): self.assertTrue( np.isclose( a=a, b=b, rtol=1e-6, atol=1e-8), - "gradient clip by global norm has wrong results, expetcd:%f, but recieved:%f" + "gradient clip by global norm has wrong results, expected:%f, but received:%f" % (a, b)) diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 965ae65614a40..51ff8ec943d01 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -198,7 +198,7 @@ def test_check_grad(self): @skip_check_grad_ci( - reason="For 'TestHSigmoidOpSparse', check_grad is is separately calculated by 'TestHSigmoidOpWithSparseGrad'." + reason="For 'TestHSigmoidOpSparse', check_grad is separately calculated by 'TestHSigmoidOpWithSparseGrad'."
) class TestHSigmoidOpSparse(OpTest): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py index ba1e9be815de6..a0c5ce77f1d25 100644 --- a/python/paddle/fluid/tests/unittests/test_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_optimizer.py @@ -1123,7 +1123,7 @@ def test_api_eager_dygraph(self): class TestMasterWeightSaveForFP16(unittest.TestCase): ''' - For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to to improve the accuracy. + For Amp-O2, some optimizer(Momentum, Adam ...) will create master weights for parameters to improve the accuracy. Master weights will be saved by optimizer::state_dict. ''' diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py index 20a5fcb7af3b1..9b48a87bff7b9 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py @@ -32,7 +32,7 @@ def test_seresnext_with_learning_rate_decay(self): self._compare_result_with_origin_model( check_func, use_device=DeviceType.CPU, - compare_seperately=False, + compare_separately=False, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py index 9d1364cc592fe..ff529ce94bd25 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py @@ -30,7 +30,7 @@ def test_seresnext_with_learning_rate_decay(self): optimizer=seresnext_net.optimizer, use_parallel_executor=False) self._compare_result_with_origin_model( - check_func, use_device=DeviceType.CUDA, compare_seperately=False) + check_func, use_device=DeviceType.CUDA, compare_separately=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index f6dc3fba6a214..95ad254a6dfb0 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -116,7 +116,7 @@ def init_test_case(self): self.output_size = None -class TestUnpoolOpOuputsize(TestUnpoolOp): +class TestUnpoolOpOutputsize(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" @@ -127,7 +127,7 @@ def init_test_case(self): self.output_size = [9, 9] -class TestUnpoolOpOuput(TestUnpoolOp): +class TestUnpoolOpOutput(TestUnpoolOp): def init_test_case(self): self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index c1891d24b88c9..8e8dd7855113b 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -1039,7 +1039,7 @@ def _legacy_load(path, **configs): config) else: # load state dict by `io.save_params/persistables` save format - # TODO(chenweihang): [ Now only supports loading parameters seperately ] + # TODO(chenweihang): [ Now only supports loading parameters separately ] # If users save all parameters as one file, the [ variable.name -> variable ] # mapping info will lost, so users need to give variable list, but users build # variable list in 
dygraph mode is difficult, we recommend users to use diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index c17a56fc28d88..4b349a1957731 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -915,7 +915,7 @@ class Model(object): When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. - In static graph mode, before traing with pure float16 (AMP O2), + In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can avoid poor accuracy or slow convergence in a way, and inputs of dtype float should be cast to float16 by users. `paddle.static.amp.fp16_guard` API @@ -2075,7 +2075,7 @@ def _run_one_epoch( # [input1, input2, ..., label1, lable2, ...] # 3. custumed iterator yield concated inputs and labels: # [input1, input2, ..., label1, lable2, ...] - # 4. custumed iterator yield seperated inputs and labels: + # 4. custumed iterator yield separated inputs and labels: # ([input1, input2, ...], [label1, lable2, ...]) # To handle all of these, flatten (nested) list to list. data = flatten(data) diff --git a/python/paddle/incubate/autotune.py b/python/paddle/incubate/autotune.py index e98a23bc52d65..7ac555e2520ea 100644 --- a/python/paddle/incubate/autotune.py +++ b/python/paddle/incubate/autotune.py @@ -49,7 +49,7 @@ def set_config(config=None): dictionary, the key is the tuning type, and the value is a dictionary of the corresponding tuning parameters. If it is a string, the path of a json file will be specified and the tuning configuration will be set - by the the json file. Default: None, auto-tuning for kernel, layout and + by the json file. Default: None, auto-tuning for kernel, layout and dataloader will be enabled. 
Examples: diff --git a/python/paddle/incubate/distributed/models/moe/grad_clip.py b/python/paddle/incubate/distributed/models/moe/grad_clip.py index b620253b9f26f..cf56f74d1f12d 100644 --- a/python/paddle/incubate/distributed/models/moe/grad_clip.py +++ b/python/paddle/incubate/distributed/models/moe/grad_clip.py @@ -158,7 +158,7 @@ def _dygraph_clip(self, params_grads): normal_params_grads = [] moe_params_grads = [] - # seperate moe params from normal params + # separate moe params from normal params if self.moe_group is not None and self.moe_group.nranks > 1: for p, g in params_grads: if self.is_expert_param_func(p): diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index d76b990958c94..072c7d9fccade 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -101,9 +101,9 @@ def __init__(self, super(FusedMultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -278,10 +278,10 @@ def __init__(self, super(FusedFeedForward, self).__init__() assert d_model > 0, ( - "Expected d_model to be greater than 0, but recieved {}".format( + "Expected d_model to be greater than 0, but received {}".format( d_model)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". format(dim_feedforward)) self._dtype = self._helper.get_default_dtype() @@ -434,12 +434,12 @@ def __init__(self, super(FusedTransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate self.normalize_before = normalize_before @@ -808,11 +808,11 @@ def __init__(self, super(FusedMultiTransformer, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but recieved {}". + "Expected dim_feedforward to be greater than 0, but received {}". 
format(dim_feedforward)) self.normalize_before = normalize_before diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ca3ac1772829d..0d5ad46a4fb5b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -542,7 +542,7 @@ def margin_ranking_loss(input, name=None): r""" - This op the calcluate the the margin rank loss between the input, other and label, use the math function as follows. + This op calculates the margin rank loss between the input, other and label, using the math function as follows. .. math:: margin\_rank\_loss = max(0, -label * (input - other) + margin) @@ -879,7 +879,7 @@ def kl_div(input, label, reduction='mean', name=None): While :attr:`reduction` is :attr:`none`, output loss is in the same shape as input, loss in each point is calculated - seperately and no reduction is applied. + separately and no reduction is applied. While :attr:`reduction` is :attr:`mean`, output loss is in shape of [1] and loss value is the mean value of all losses. @@ -2006,7 +2006,7 @@ def sigmoid_focal_loss(logit, Available dtype is float32, float64. normalizer (Tensor, optional): The number normalizes the focal loss. It has to be a 1-D Tensor whose shape is `[1, ]`. The data type is float32, float64. - For object detection task, it is the the number of positive samples. + For object detection task, it is the number of positive samples. If set to None, the focal loss will not be normalized. Default is None. alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, it should be between 0 and 1. Default value is set to 0.25. diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index cd82fe12fff6b..7fd109843bede 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -367,7 +367,7 @@ class PReLU(Layer): Parameters: num_parameters (int, optional): Number of `weight` to learn. The supported values are: 1 - a single parameter `alpha` is used for all input channels; - Number of channels - a seperate `alpha` is used for each input channel. + Number of channels - a separate `alpha` is used for each input channel. Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`.
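For reference, a minimal sketch of the two `num_parameters` modes described in the corrected PReLU docstring above (hypothetical usage, not part of this patch; it assumes only the `paddle.nn.PReLU` API shown in the surrounding diff):

import paddle

x = paddle.uniform([2, 4, 8, 8])                 # NCHW input with 4 channels
shared = paddle.nn.PReLU(num_parameters=1)       # one alpha shared by all channels
per_channel = paddle.nn.PReLU(num_parameters=4)  # a separate alpha per input channel
print(shared(x).shape, per_channel(x).shape)     # both preserve the shape [2, 4, 8, 8]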
diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index b0b6e62a602aa..340372f9b6a4e 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -163,9 +163,9 @@ def __init__(self, super(MultiHeadAttention, self).__init__() assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + "but received {}".format(embed_dim)) assert num_heads > 0, ("Expected num_heads to be greater than 0, " - "but recieved {}".format(num_heads)) + "but received {}".format(num_heads)) self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim @@ -508,12 +508,12 @@ def __init__(self, super(TransformerEncoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -813,12 +813,12 @@ def __init__(self, super(TransformerDecoderLayer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -1220,12 +1220,12 @@ def __init__(self, super(Transformer, self).__init__() assert d_model > 0, ("Expected d_model to be greater than 0, " - "but recieved {}".format(d_model)) + "but received {}".format(d_model)) assert nhead > 0, ("Expected nhead to be greater than 0, " - "but recieved {}".format(nhead)) + "but received {}".format(nhead)) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but recieved {}".format(dim_feedforward)) + "but received {}".format(dim_feedforward)) if isinstance(bias_attr, (list, tuple)): if len(bias_attr) == 1: diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py index 77adbaff34859..9df595bc3ae73 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -150,7 +150,7 @@ def getScheduleState(step: int) -> ProfilerState: def _default_state_scheduler(step: int): r""" - A default state scheduler, keep recording from the begining of the profiler until ending. + A default state scheduler, keep recording from the beginning of the profiler until ending. """ return ProfilerState.RECORD diff --git a/python/paddle/profiler/timer.py b/python/paddle/profiler/timer.py index 1fb06ddc55e39..815775ebc6aad 100644 --- a/python/paddle/profiler/timer.py +++ b/python/paddle/profiler/timer.py @@ -193,7 +193,7 @@ def begin(self, benchmark): def before_reader(self, benchmark): """ Initialize the start time of the dataloader. 
This function will be - called at the begining of `next` method in `_DataLoaderIterMultiProcess` or + called at the beginning of `next` method in `_DataLoaderIterMultiProcess` or `_DataLoaderIterSingleProcess`. """ @@ -220,8 +220,8 @@ def after_step(self, benchmark): Record the cost for the current step. It will contain the cost of the loading data if there is a dataloader. Similar to `after_reader`, it will also update the maximum, minimum and the total time from the step 11 to the current step - as well as the the maximum and minimum speed of the model. This function will - be called in in `Profiler.step()`. + as well as the maximum and minimum speed of the model. This function will + be called in `Profiler.step()`. """ @@ -401,7 +401,7 @@ def check_if_need_record(self, reader): # enter a new task but not calling beign() to record it. # we pause the timer until the end of new task, so that # the cost of new task is not added to the current event. - # eg. start evaluation in the traing task + # eg. start evaluation in the training task self.current_event.need_record = False else: # when the new task exits, continue timing for the current event. diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index fd75ab9550d52..5e95c83129f53 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -78,7 +78,7 @@ def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any): def begin(self): r""" - Record the time of begining. + Record the time of beginning. Examples: diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 5ee372f7b956a..7cfae842337c6 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3346,7 +3346,7 @@ def increment(x, value=1.0, name=None): def all(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical and`` of tensor elements over the given dimension. + Computes the ``logical and`` of tensor elements over the given dimension. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. @@ -3442,7 +3442,7 @@ def all(x, axis=None, keepdim=False, name=None): def any(x, axis=None, keepdim=False, name=None): """ - Computes the the ``logical or`` of tensor elements over the given dimension. + Computes the ``logical or`` of tensor elements over the given dimension. Args: x (Tensor): An N-D Tensor, the input data type should be `bool`. diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 71c97d4cac986..42d3bf9fca364 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -42,7 +42,7 @@ def set_printoptions(precision=None, Args: precision (int, optional): Number of digits of the floating number, default 8. threshold (int, optional): Total number of elements printed, default 1000. - edgeitems (int, optional): Number of elements in summary at the begining and ending of each dimension, default 3. + edgeitems (int, optional): Number of elements in summary at the beginning and ending of each dimension, default 3. sci_mode (bool, optional): Format the floating number with scientific notation or not, default False. linewidth (int, optional): Number of characters each line, default 80. 
diff --git a/python/paddle/vision/models/mobilenetv3.py b/python/paddle/vision/models/mobilenetv3.py index da7ae010c58f6..70aa1b833d648 100644 --- a/python/paddle/vision/models/mobilenetv3.py +++ b/python/paddle/vision/models/mobilenetv3.py @@ -39,7 +39,7 @@ class SqueezeExcitation(nn.Layer): """ This block implements the Squeeze-and-Excitation block from https://arxiv.org/abs/1709.01507 (see Fig. 1). - Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in in eq. 3. + Parameters ``activation``, and ``scale_activation`` correspond to ``delta`` and ``sigma`` in eq. 3. This code is based on the torchvision code with modifications. You can also see at https://github.com/pytorch/vision/blob/main/torchvision/ops/misc.py#L127 Args: From cde2b24d0c01d310889224cbfee9d6b7989e1941 Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Fri, 13 May 2022 10:07:09 +0800 Subject: [PATCH 34/49] Update api docs (#42725) --- python/paddle/nn/functional/loss.py | 22 +++++++++++++--------- python/paddle/nn/layer/loss.py | 14 +++++++++----- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 0d5ad46a4fb5b..d08821e510c2b 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -392,20 +392,24 @@ def hsigmoid_loss(input, paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.8018668 0.8736385 -0.9064771 ] # random - # [-0.10228515 -0.87188244 -0.8783718 ]] # random + input = paddle.uniform([4, 3]) + # [[0.45424712 -0.77296764 0.82943869] # random + # [0.85062802 0.63303483 0.35312140] # random + # [0.57170701 0.16627562 0.21588242] # random + # [0.27610803 -0.99303514 -0.17114788]] # random label = paddle.to_tensor([0, 1, 4, 5]) num_classes = 5 weight=paddle.uniform([num_classes-1, 3]) - # [[-0.24148715 0.8449961 -0.7399121 ] # random - # [-0.9800559 0.43509364 0.9091208 ] # random - # [ 0.60194826 0.10430074 -0.4521166 ] # random - # [-0.4469818 -0.01536179 -0.604454 ]] # random + # [[-0.64477652 0.24821866 -0.17456549] # random + # [-0.04635394 0.07473493 -0.25081766] # random + # [ 0.05986035 -0.12185556 0.45153677] # random + # [-0.66236806 0.91271877 -0.88088769]] # random out=F.hsigmoid_loss(input, label, num_classes, weight) - # [[3.0159328] - # [2.2407534]] + # [[1.96709502] + # [2.40019274] + # [2.11009121] + # [1.92374969]] """ if _non_static_mode(): diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index d4e059b6dfa49..a20e7de751d16 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -465,14 +465,18 @@ class HSigmoidLoss(Layer): import paddle paddle.set_device('cpu') - input = paddle.uniform([2, 3]) - # [[-0.2820413 0.9528898 -0.81638825] # random - # [-0.6733154 -0.33866507 0.25770962]] # random + input = paddle.uniform([4, 3]) + # [[0.56194401 -0.22450298 -0.10741806] # random + # [0.36136317 0.23556745 0.88748658] # random + # [0.18151939 0.80947340 -0.31078976] # random + # [0.68886101 -0.14239830 -0.41297770]] # random label = paddle.to_tensor([0, 1, 4, 5]) m = paddle.nn.HSigmoidLoss(3, 5) out = m(input, label) - # [[2.4543471] - # [1.9359267]] + # [[2.42524505] + # [1.74917245] + # [3.14571381] + # [2.34564662]] """ def __init__(self, From 82ce2d56f4587a01ff2f96c922f5101d681217f4 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Fri, 13 May 2022 10:12:36 +0800 Subject: [PATCH 35/49] update new feature of mlperf (#735) 
(#42706) * update new feature of mlperf Co-authored-by: yaozhixin <522190855@qq.com> --- paddle/fluid/platform/device/ipu/ipu_executor.cc | 7 ++++++- paddle/fluid/platform/device/ipu/ipu_strategy.cc | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 96c2b4f9a9ded..d01f8b65bbcc8 100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -215,7 +215,12 @@ void Executor::Run(const std::vector &inputs, popart::StepIO stepio(popart_inputs, popart_anchors); VLOG(10) << "Running..."; - session_->run(stepio); + if (ipu_strategy_->popart_options.createImplicitPipeliningFwdOnlyProgram && + ipu_strategy_->runtime_options.enable_eval) { + session_->run("implicitPipeliningFwdOnly", stepio); + } else { + session_->run(stepio); + } VLOG(10) << "Running...done"; } diff --git a/paddle/fluid/platform/device/ipu/ipu_strategy.cc b/paddle/fluid/platform/device/ipu/ipu_strategy.cc index 5bf705864ef3c..714f44c69b0d9 100644 --- a/paddle/fluid/platform/device/ipu/ipu_strategy.cc +++ b/paddle/fluid/platform/device/ipu/ipu_strategy.cc @@ -285,6 +285,8 @@ IpuStrategy::IpuStrategy() { ADD_POPART_BOOL_OPTION_ALIAS( schedule_non_weight_update_gradient_consumers_early, scheduleNonWeightUpdateGradientConsumersEarly); + ADD_POPART_BOOL_OPTION_ALIAS(create_implicit_pipelining_fwd_only_program, + createImplicitPipeliningFwdOnlyProgram); ADD_POPART_DOUBLE_OPTION_ALIAS(outline_sequence_break_cost, outlineSequenceBreakCost); From 3e7e0af689554b6dd4b2adfd99c2af7cd7d3cadf Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 13 May 2022 10:46:23 +0800 Subject: [PATCH 36/49] [Eager] Roll back legacy for fused_attention and fused_feedforward op test (#42719) --- python/paddle/fluid/tests/unittests/test_fused_attention_op.py | 3 ++- .../paddle/fluid/tests/unittests/test_fused_feedforward_op.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py index a3ae2a20dba23..67160f59952ef 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_attention_op.py @@ -26,7 +26,8 @@ from paddle.fluid import layers import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() default_main_program().random_seed = 42 diff --git a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py index a533b5d87a5a9..8c68eb243aea8 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_feedforward_op.py @@ -23,7 +23,8 @@ from paddle.nn.layer.common import Linear, Dropout import unittest from op_test import OpTest -from paddle.fluid.framework import default_main_program +from paddle.fluid.framework import default_main_program, _enable_legacy_dygraph +_enable_legacy_dygraph() class TestFusedFFNOp(OpTest): From 99c9265f408a5e972cb1753162f968bc35086bbf Mon Sep 17 00:00:00 2001 From: wangna11BD <79366697+wangna11BD@users.noreply.github.com> Date: Fri, 13 May 2022 11:24:20 +0800 Subject: [PATCH 37/49] fix spectral_norm en doc (#42728) --- python/paddle/nn/utils/spectral_norm_hook.py | 
2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 75266abdf0d13..56c9e83c38b06 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -178,7 +178,7 @@ def spectral_norm(layer, .. code-block:: python from paddle.nn import Conv2D - from paddle.nn.utils import Spectralnorm + from paddle.nn.utils import spectral_norm conv = Conv2D(3, 1, 3) sn_conv = spectral_norm(conv) From 0c6baf3c95c8e8e364ad8175daaf6d62d0020e78 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 13 May 2022 11:28:26 +0800 Subject: [PATCH 38/49] refine directory of build-time (#42717) --- CMakeLists.txt | 4 ++-- tools/get_build_time.sh | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 51c0ef35f1efa..433081ee2256b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -256,8 +256,8 @@ option(WITH_CUSTOM_DEVICE "Compile with custom device support" OFF) option(WITH_ARM_BRPC "Supprot Brpc in Arm" OFF) if(WITH_RECORD_BUILDTIME) - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh") + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_CURRENT_SOURCE_DIR}/tools/get_build_time.sh ${CMAKE_CURRENT_BINARY_DIR}") else() include(ccache) # set ccache for compilation ; if WITH_RECORD_BUILDTIME=ON can't use ccache endif() diff --git a/tools/get_build_time.sh b/tools/get_build_time.sh index 1563fefff3799..496c8c12d6ca3 100755 --- a/tools/get_build_time.sh +++ b/tools/get_build_time.sh @@ -14,9 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -CUR_ROOT=$(dirname "$0")/.. 
+CMAKE_BINARY_DIR=$1 +shift start=$(date +%s.%N) duration=$("/usr/bin/time" -f "%C, %E elapsed, %U user, %S sys" "$@" 2>&1) end=$(date +%s.%N) -echo ${duration}, 'start', $start, 'end', $end, 'process', $$ >> $CUR_ROOT/build/build-time +echo ${duration}, 'start', $start, 'end', $end, 'process', $$ >> $CMAKE_BINARY_DIR/build-time From 3052f36c3a4cd35894f48ecc0bd19895b4da9b5f Mon Sep 17 00:00:00 2001 From: kuizhiqing Date: Fri, 13 May 2022 13:41:54 +0800 Subject: [PATCH 39/49] [Launch] add gpu report during training (#42675) * add nvsmi * collect gpu info to log * fix unitest * rm ret_type --- .../launch/controllers/controller.py | 6 + .../distributed/launch/controllers/watcher.py | 95 ++++++++++++++ .../paddle/distributed/launch/utils/nvsmi.py | 117 ++++++++++++++++++ .../paddle/fluid/tests/unittests/test_run.py | 4 +- 4 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 python/paddle/distributed/launch/controllers/watcher.py create mode 100644 python/paddle/distributed/launch/utils/nvsmi.py diff --git a/python/paddle/distributed/launch/controllers/controller.py b/python/paddle/distributed/launch/controllers/controller.py index 69b2237f0ba7d..f069bfbcd3501 100644 --- a/python/paddle/distributed/launch/controllers/controller.py +++ b/python/paddle/distributed/launch/controllers/controller.py @@ -21,6 +21,7 @@ from paddle.distributed.launch.job.container import Container from .master import Master +from .watcher import Watcher import time @@ -39,6 +40,8 @@ def __init__(self, ctx): self.ctx = ctx self.master = Master.factory(self.ctx) + self.watcher = Watcher(self.ctx) + self.job = Job(nnodes=self.ctx.args.nnodes, mode=self.ctx.args.run_mode, jid=self.ctx.args.job_id) @@ -114,6 +117,9 @@ def watch(self) -> bool: def stop(self, sigint=None): self.ctx.logger.debug("Controller stop") + + self.watcher.stop() + self.master.stop() self.pod.stop(sigint) diff --git a/python/paddle/distributed/launch/controllers/watcher.py b/python/paddle/distributed/launch/controllers/watcher.py new file mode 100644 index 0000000000000..4d49b924f1e81 --- /dev/null +++ b/python/paddle/distributed/launch/controllers/watcher.py @@ -0,0 +1,95 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from ..utils.nvsmi import get_gpu_process, get_gpu_util, get_gpu_info +import time +import os + +from threading import Thread + + +class Watcher(object): + def __init__(self, ctx): + self.ctx = ctx + + self.interval = 10 + + self.gpu_util = [] + + # gpu log file + self.gpus = self.ctx.args.devices or self.ctx.node.device.labels + if len(self.gpus) > 0: + fn = os.path.join(self.ctx.args.log_dir, + "{}.gpu.log".format(self.ctx.args.job_id)) + os.makedirs(os.path.dirname(fn), exist_ok=True) + self.gpu_fd = open(fn, 'w') + else: + return + + # start + self.proc = Thread(target=self.watch) + self.proc.daemon = True + self.proc.start() + + def watch(self): + if not len(self.gpus) > 0: + return + + self._print_gpu_info() + + util_key = "index,utilization_gpu,memory_total,memory_used,memory_free,timestamp" + self.gpu_fd.write(util_key) + self.gpu_fd.write('\n') + + while not self.ctx.status.is_done(): + self._save_gpu_log(util_key) + time.sleep(self.interval) + + if hasattr(self, "gpu_fd"): + self.gpu_fd.close() + + def _print_gpu_info(self): + try: + info_key = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode" + self.gpu_fd.write(info_key) + self.gpu_fd.write('\n') + for line in get_gpu_info(self.gpus): + self.gpu_fd.write(line.str(info_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + process_key = "pid,process_name,gpu_uuid,gpu_name,used_memory" + self.gpu_fd.write(process_key) + self.gpu_fd.write('\n') + for line in get_gpu_process(self.gpus): + self.gpu_fd.write(line.str(process_key)) + self.gpu_fd.write('\n') + self.gpu_fd.write('\n') + + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu info failed") + + def _save_gpu_log(self, util_key): + try: + for line in get_gpu_util(self.gpus): + self.gpu_fd.write(line.str(util_key)) + self.gpu_fd.write('\n') + self.gpu_fd.flush() + except: + self.ctx.log.error("save gpu log failed") + + def stop(self): + if hasattr(self, "proc"): + self.proc.join() diff --git a/python/paddle/distributed/launch/utils/nvsmi.py b/python/paddle/distributed/launch/utils/nvsmi.py new file mode 100644 index 0000000000000..82a23189ac6af --- /dev/null +++ b/python/paddle/distributed/launch/utils/nvsmi.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import subprocess +import shlex +import os +import json +import shutil + + +class Info(object): + def __repr__(self): + return str(self.__dict__) + + def json(self): + return json.dumps(self.__dict__) + + def dict(self): + return self.__dict__ + + def str(self, keys=None): + if keys is None: + keys = self.__dict__.keys() + + if isinstance(keys, str): + keys = keys.split(',') + + values = [str(self.__dict__.get(k, '')) for k in keys] + return ",".join(values) + + +def query_smi(query=None, query_type="gpu", index=None, dtype=None): + """ + query_type: gpu/compute + """ + + if not has_nvidia_smi(): + return [] + + cmd = ["nvidia-smi", "--format=csv,noheader,nounits"] + if isinstance(query, list) and query_type == "gpu": + cmd.extend(["--query-gpu={}".format(",".join(query))]) + elif isinstance(query, list) and query_type.startswith("compute"): + cmd.extend(["--query-compute-apps={}".format(",".join(query))]) + else: + return + + if isinstance(index, list) and len(index) > 0: + cmd.extend(["--id={}".format(",".join(index))]) + if not isinstance(dtype, list) or len(dtype) != len(query): + dtype = [str] * len(query) + + output = subprocess.check_output(cmd, timeout=3) + lines = output.decode("utf-8").split(os.linesep) + ret = [] + for line in lines: + if not line: + continue + info = Info() + for k, v, d in zip(query, line.split(", "), dtype): + setattr(info, k.replace(".", "_"), d(v)) + ret.append(info) + return ret + + +def get_gpu_info(index=None): + q = "index,uuid,driver_version,name,gpu_serial,display_active,display_mode".split( + ",") + d = [int, str, str, str, str, str, str] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_util(index=None): + q = "index,utilization.gpu,memory.total,memory.used,memory.free,timestamp".split( + ",") + d = [int, int, int, int, int, str] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, dtype=d) + + +def get_gpu_process(index=None): + q = "pid,process_name,gpu_uuid,gpu_name,used_memory".split(",") + d = [int, str, str, str, int] + index = index if index is None or isinstance( + index, list) else str(index).split(",") + + return query_smi(q, index=index, query_type="compute", dtype=d) + + +def has_nvidia_smi(): + return shutil.which("nvidia-smi") + + +if __name__ == '__main__': + print(get_gpu_info(0)) + print(get_gpu_util(0)) + print(get_gpu_process(0)) + + u = get_gpu_util() + for i in u: + print(i.str()) diff --git a/python/paddle/fluid/tests/unittests/test_run.py b/python/paddle/fluid/tests/unittests/test_run.py index 365d3f931c27c..28bcc379fb9a0 100644 --- a/python/paddle/fluid/tests/unittests/test_run.py +++ b/python/paddle/fluid/tests/unittests/test_run.py @@ -51,7 +51,9 @@ def write_file(name, ct): def get_files(pth, prefix): return [ - f for f in listdir(pth) if isfile(join(pth, f)) and f.startswith(prefix) + f for f in listdir(pth) + if isfile(join(pth, f)) and f.startswith(prefix) and f != + f"{prefix}.gpu.log" ] From cbc5ca0fb9869e94b34e7687fc45754cd17b98ab Mon Sep 17 00:00:00 2001 From: Tao CHANG Date: Fri, 13 May 2022 13:50:25 +0800 Subject: [PATCH 40/49] add communication cost for cost model (#42727) --- .../auto_parallel/cost/__init__.py | 13 +- .../auto_parallel/cost/base_cost.py | 18 +- .../auto_parallel/cost/comm_op_cost.py | 140 +++++++++++++++- .../unittests/auto_parallel/CMakeLists.txt | 1 + .../unittests/auto_parallel/test_comm_cost.py | 158 
++++++++++++++++++ .../auto_parallel/test_new_cost_model.py | 26 ++- 6 files changed, 347 insertions(+), 9 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py diff --git a/python/paddle/distributed/auto_parallel/cost/__init__.py b/python/paddle/distributed/auto_parallel/cost/__init__.py index 9ea58d6979527..ea6b3bc5b7e76 100644 --- a/python/paddle/distributed/auto_parallel/cost/__init__.py +++ b/python/paddle/distributed/auto_parallel/cost/__init__.py @@ -14,7 +14,16 @@ from .base_cost import _g_op_cost_factory from .base_cost import Cost -from .comm_op_cost import AllreduceSumCost -from .comp_op_cost import MatmulV2OpCost +from .base_cost import CommContext +from .base_cost import build_comm_desc from .tensor_cost import TensorCost from .estimate_cost import CostEstimator + +from .comp_op_cost import MatmulV2OpCost + +from .comm_op_cost import SendOpCost +from .comm_op_cost import RecvOpCost +from .comm_op_cost import IdentityOpCost +from .comm_op_cost import BroadcastOpCost +from .comm_op_cost import AllgatherOpCost +from .comm_op_cost import AllreduceSumOpCost diff --git a/python/paddle/distributed/auto_parallel/cost/base_cost.py b/python/paddle/distributed/auto_parallel/cost/base_cost.py index cb16d522bc9e3..f1843b8f16527 100644 --- a/python/paddle/distributed/auto_parallel/cost/base_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/base_cost.py @@ -13,15 +13,31 @@ # limitations under the License from collections import OrderedDict +from functools import reduce + import paddle +from ..cluster import LinkType +from ..process_group import get_process_group + COMM_OP_TYPE = [ - "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum" + "send_v2", "recv_v2", "c_broadcast", "c_allgather", "c_allreduce_sum", + "c_identity" ] NON_COMP_TYPE = ["while"] + COMM_OP_TYPE _g_op_cost_factory = {} +def build_comm_desc(op_type, group_ranks, dtype, shape, attrs=None): + desc = {} + desc["op"] = op_type + desc["group_ranks"] = group_ranks + desc["inputs"] = {"X": [(dtype, shape)]} + if attrs is not None: + desc["attrs"] = attrs + return desc + + def _parse_op_to_desc(op, dist_context=None): desc = {} desc["op"] = op.type diff --git a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py index 235741ba12f4f..a32fdf1824e62 100644 --- a/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comm_op_cost.py @@ -12,17 +12,149 @@ # See the License for the specific language governing permissions and # limitations under the License -from .base_cost import register_op_cost, CommOpCost +import math + +from .base_cost import register_op_cost, CommOpCost, _g_op_cost_factory @register_op_cost -class AllreduceSumCost(CommOpCost): +class AllreduceSumOpCost(CommOpCost): OP_TYPE = "c_allreduce_sum" def __init__(self, op=None, op_desc=None, comm_context=None): - super(AllreduceSumCost, self).__init__( + super(AllreduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + # use tree if cross machine and use ring if in a single machine + time = None + cluster = self.comm_context.cluster + if not cluster.cross_machine(self.group_ranks): + time = self.calc_time_ring() + else: + time = self.calc_time_tree() + + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += 2 * ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + 
alpha += 2 * (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + 2 * (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + + return time + + def calc_time_tree(self): + alpha = self.comm_context.base_tree + alpha += 2 * (self.rank_count / self.machine_count - 1 + ) * self.comm_context.intra_tree + alpha += math.log2(self.machine_count) * ( + self.comm_context.inter_tree + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + + time = alpha + 2 * self.comm_count * beta + + return time + + +@register_op_cost +class AllgatherOpCost(CommOpCost): + OP_TYPE = "c_allgather" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(AllgatherOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + alpha += ( + self.rank_count - self.machine_count) * self.comm_context.intra_ring + alpha += (self.machine_count - 1) * ( + self.comm_context.inter_ring + self.hops * self.comm_context.switch) + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + (self.rank_count - 1 + ) / self.rank_count * self.comm_count * beta + return time + + +@register_op_cost +class BroadcastOpCost(CommOpCost): + OP_TYPE = "c_broadcast" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(BroadcastOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + time = self.calc_time_ring() + return time + + def calc_time_ring(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time + + +@register_op_cost +class IdentityOpCost(CommOpCost): + OP_TYPE = "c_identity" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(IdentityOpCost, self).__init__( op=op, op_desc=op_desc, comm_context=comm_context) def calc_time(self): - # NOTE: The actual formula will be filled in the future. 
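+        # c_identity moves no data between ranks, so its communication
+        # time is modeled as zero.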
return 0 + + +@register_op_cost +class RecvOpCost(CommOpCost): + OP_TYPE = "recv_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(RecvOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + return time + + +@register_op_cost +class SendOpCost(CommOpCost): + OP_TYPE = "send_v2" + + def __init__(self, op=None, op_desc=None, comm_context=None): + super(SendOpCost, self).__init__( + op=op, op_desc=op_desc, comm_context=comm_context) + + def calc_time(self): + alpha = self.comm_context.base_ring + if self.machine_count > 1: + alpha += self.comm_context.inter_ring + self.hops * self.comm_context.switch + else: + alpha += self.comm_context.intra_ring + beta = self.comm_context.get_max_beta(self.group_ranks) + time = alpha + self.comm_count * beta + + return time diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 7c747338593a3..1f846f5d7361c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -29,4 +29,5 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_dist_pnorm MODULES test_dist_pnorm ENVS ${dist_ENVS}) py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) + py_test_modules(test_comm_cost MODULES test_comm_cost ENVS ${dist_ENVS}) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py new file mode 100644 index 0000000000000..f0ad1f4ed314d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comm_cost.py @@ -0,0 +1,158 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
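The comm-op cost classes above all instantiate the same alpha-beta model: a latency term alpha assembled from the cluster's base, intra-machine, inter-machine and switch latencies, plus a bandwidth term beta scaled by the payload each rank moves. Below is a minimal, self-contained sketch of the two AllreduceSumOpCost code paths; the constants are illustrative stand-ins for the values CommContext derives from the cluster description, and the payload is treated as a raw byte count:

    import math

    # Illustrative constants only; CommContext computes the real values
    # from the cluster topology.
    BASE_RING = BASE_TREE = 1.0        # fixed startup latency
    INTRA_RING = INTRA_TREE = 0.5      # latency per intra-machine step
    INTER_RING = INTER_TREE = 5.0      # latency per inter-machine step
    SWITCH, HOPS, BETA = 2.0, 1, 1e-3  # per-hop switch latency; time per byte

    def allreduce_ring_time(payload, rank_count, machine_count):
        # A ring all-reduce over p ranks takes 2 * (p - 1) steps:
        # 2 * (p - m) stay inside a machine, 2 * (m - 1) cross machines.
        alpha = BASE_RING
        alpha += 2 * (rank_count - machine_count) * INTRA_RING
        alpha += 2 * (machine_count - 1) * (INTER_RING + HOPS * SWITCH)
        # Each rank forwards 2 * (p - 1) / p of the payload in total.
        return alpha + 2 * (rank_count - 1) / rank_count * payload * BETA

    def allreduce_tree_time(payload, rank_count, machine_count):
        # Tree variant, chosen once the group spans machines: intra-machine
        # fan-in plus log2(m) inter-machine stages.
        alpha = BASE_TREE
        alpha += 2 * (rank_count / machine_count - 1) * INTRA_TREE
        alpha += math.log2(machine_count) * (INTER_TREE + HOPS * SWITCH)
        return alpha + 2 * payload * BETA

    # Roughly 128 MB of float32 over 8 ranks on one machine, and over
    # 16 ranks on two machines, mirroring the unit tests that follow.
    print(allreduce_ring_time(128 * 10**6, rank_count=8, machine_count=1))
    print(allreduce_tree_time(128 * 10**6, rank_count=16, machine_count=2))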
+ +import unittest +import os +import json + +import paddle +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from paddle.distributed.auto_parallel.cost import build_comm_desc +from paddle.distributed.auto_parallel.cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost import AllgatherOpCost +from paddle.distributed.auto_parallel.cost import BroadcastOpCost +from paddle.distributed.auto_parallel.cost import SendOpCost +from paddle.distributed.auto_parallel.cost import RecvOpCost +from paddle.distributed.auto_parallel.cost import IdentityOpCost + +from test_cluster import cluster_json, multi_cluster_json + + +class TestCommOpCost(unittest.TestCase): + def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", [0, 1, 2, 3, 4, 5, 6, 7], paddle.float32, + [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc("c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc("c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Check IdentityOpCost cost + identity_op_desc = build_comm_desc("c_identity", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + identity_op_cost = IdentityOpCost( + op_desc=identity_op_desc, comm_context=comm_context) + self.assertTrue(identity_op_cost.time >= 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + def test_cross_machine_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(multi_cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) + + # Check 
AllreduceSumCost 128MB ring cost + allreduce_sum_op_desc = build_comm_desc( + "c_allreduce_sum", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allreduce_sum_op_cost = AllreduceSumOpCost( + op_desc=allreduce_sum_op_desc, comm_context=comm_context) + + # Check AllgatherOpCost cost + allgather_op_desc = build_comm_desc( + "c_allgather", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + allgather_op_cost = AllgatherOpCost( + op_desc=allgather_op_desc, comm_context=comm_context) + self.assertTrue(allgather_op_cost.time > 0) + + # Check BroadcastOpCost cost + broadcast_op_desc = build_comm_desc( + "c_broadcast", + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + paddle.float32, [1, 32 * (10**6)]) + broadcast_op_cost = BroadcastOpCost( + op_desc=broadcast_op_desc, comm_context=comm_context) + self.assertTrue(broadcast_op_cost.time > 0) + + # Check SendOpCost cost + send_op_desc = build_comm_desc("send_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + send_op_cost = SendOpCost( + op_desc=send_op_desc, comm_context=comm_context) + self.assertTrue(send_op_cost.time > 0) + + # Check RecvOpCost cost + recv_op_desc = build_comm_desc("recv_v2", [0, 1], paddle.float32, + [1, 32 * (10**6)]) + recv_op_cost = RecvOpCost( + op_desc=recv_op_desc, comm_context=comm_context) + self.assertTrue(recv_op_cost.time > 0) + + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py index 6d6fbfe78e9e6..c0df01ada58f9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_new_cost_model.py @@ -13,12 +13,17 @@ # limitations under the License. 
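Both test files drive the cost classes through the plain-dictionary descriptors built by build_comm_desc in base_cost.py; the comm-op costs read the op type, the group_ranks list, and the (dtype, shape) entry under inputs["X"] to size the payload. A short sketch of that layout, assuming a build that already carries this patch:

    import paddle
    from paddle.distributed.auto_parallel.cost import build_comm_desc

    desc = build_comm_desc("c_allreduce_sum", [0, 1, 2, 3], paddle.float32,
                           [1, 32 * (10**6)])
    assert desc["op"] == "c_allreduce_sum"
    assert desc["group_ranks"] == [0, 1, 2, 3]
    assert desc["inputs"]["X"] == [(paddle.float32, [1, 32 * (10**6)])]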
import unittest +import os +import json import paddle import paddle.distributed.auto_parallel.cost as cost_model from paddle.distributed.auto_parallel.cost.base_cost import parse_to_desc from paddle.distributed.auto_parallel.cost.base_cost import parse_desc_to_str from paddle.distributed.auto_parallel.cost.base_cost import calc_time_by_modeling +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.cost import CommContext +from test_cluster import cluster_json, multi_cluster_json paddle.enable_static() @@ -58,14 +63,31 @@ def test_comp_cost(self): self.assertEqual(tensor_cost.cost.memory, 1600) def test_comm_cost(self): + # Build cluster + file_dir = os.path.dirname(os.path.abspath(__file__)) + cluster_json_path = os.path.join(file_dir, "auto_parallel_cluster.json") + cluster_json_object = json.loads(cluster_json) + with open(cluster_json_path, "w") as cluster_json_file: + json.dump(cluster_json_object, cluster_json_file) + cluster = Cluster() + cluster.build_from_file(cluster_json_path) + + # Build CommConetxt + CommContext._has_instance = None + CommContext._instance = None + comm_context = CommContext(cluster) desc = {} desc["op"] = "c_allreduce_sum" - desc["inputs"] = {"X": [([100, 200], paddle.float32)]} + desc["inputs"] = {"X": [(paddle.float32, [100, 200])]} desc["group_ranks"] = [0, 1] allreduce_cost = cost_model._g_op_cost_factory["c_allreduce_sum"]( - op_desc=desc) + op_desc=desc, comm_context=CommContext(cluster)) self.assertTrue(check_cost(allreduce_cost.cost)) + # Remove unnecessary files + if os.path.exists(cluster_json_path): + os.remove(cluster_json_path) + def test_cost_estimator(self): train_program = paddle.static.Program() cost_estimator = cost_model.CostEstimator(train_program) From 9840fb70c71da7d16ce80f0f36b1f216fc0cb5a1 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Fri, 13 May 2022 14:09:32 +0800 Subject: [PATCH 41/49] [Eager] Support test_dist_hapi_model under eager mode (#42702) * [Eager] Support test_dist_hapi_model under eager mode * [Eager] Polish code * Fix code-format issue, coverage-ci issue --- python/paddle/fluid/layers/collective.py | 19 ++++++++++++++++++- python/paddle/hapi/model.py | 12 ++++++++++-- .../paddle/tests/dist_hapi_mnist_dynamic.py | 2 +- python/paddle/tests/test_dist_hapi_model.py | 11 ++++++++++- 4 files changed, 39 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/layers/collective.py b/python/paddle/fluid/layers/collective.py index 43eb436f65e78..0b4211cbb63dc 100644 --- a/python/paddle/fluid/layers/collective.py +++ b/python/paddle/fluid/layers/collective.py @@ -14,7 +14,9 @@ from __future__ import print_function from ..layer_helper import LayerHelper, unique_name -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode, _in_legacy_dygraph +import paddle +from paddle import _C_ops def _allreduce(x, out=None, reduce_type="sum", sync_mode=False): @@ -107,6 +109,21 @@ def _c_broadcast(x, root=0, ring_id=0, use_calc_stream=False): def _c_allgather(x, nranks, ring_id=0, use_calc_stream=False): op_type = 'c_allgather' + + if in_dygraph_mode(): + group = paddle.distributed.collective._get_default_group() + tensor_shape = list(x.shape) + tensor_shape[0] *= nranks + out = paddle.empty(tensor_shape, x.dtype) + task = group.process_group.all_gather(x, out) + task.wait() + return out + + if _in_legacy_dygraph(): + attrs = ('nranks', nranks, 'ring_id', ring_id, 'use_calc_stream', + use_calc_stream) + return _C_ops.c_allgather(x, *attrs) + helper = 
LayerHelper(op_type, **locals()) out_shape = list(x.shape[:]) if out_shape[0] > 0: diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 4b349a1957731..a7a5e59f39409 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -29,7 +29,7 @@ import paddle from paddle import fluid from paddle.fluid import core -from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _non_static_mode, in_dygraph_mode from paddle.fluid.framework import Variable from paddle.fluid.framework import _get_paddle_place from paddle.fluid.framework import _current_expected_place as _get_device @@ -761,6 +761,15 @@ def eval_batch(self, inputs, labels=None): labels = [to_variable(l) for l in to_list(labels)] outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) + + # Transfrom data to expected device + expected_device = paddle.device.get_device() + for o in to_list(outputs): + o._to(device=expected_device) + + for l in labels: + l._to(device=expected_device) + if self.model._loss: losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -2088,7 +2097,6 @@ def _run_one_epoch( callbacks.on_batch_begin(mode, step, logs) if mode != 'predict': - _inputs = [data[:len(self._inputs)], data[len(self._inputs):]] if mode == 'train': _inputs.append((step + 1) % self._accumulate == 0 or diff --git a/python/paddle/tests/dist_hapi_mnist_dynamic.py b/python/paddle/tests/dist_hapi_mnist_dynamic.py index eab34a6dafbc3..de0518e229b0a 100644 --- a/python/paddle/tests/dist_hapi_mnist_dynamic.py +++ b/python/paddle/tests/dist_hapi_mnist_dynamic.py @@ -58,7 +58,7 @@ def compute_accuracy(pred, gt): @unittest.skipIf(not fluid.is_compiled_with_cuda(), 'CPU testing is not supported') class TestDistTraning(unittest.TestCase): - def test_static_multiple_gpus(self): + def test_dynamic_multiple_gpus(self): device = set_device('gpu') im_shape = (-1, 1, 28, 28) diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index 16788e4656192..006800d3caeee 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -52,6 +52,7 @@ def get_gpus(selected_gpus): def start_local_trainers(cluster, pod, training_script, + eager_mode, training_script_args, log_dir=None): current_env = copy.copy(os.environ.copy()) @@ -72,6 +73,9 @@ def start_local_trainers(cluster, "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()) } + if not eager_mode: + proc_env["FLAGS_enable_eager_mode"] = "%d" % 0 + current_env.update(proc_env) print("trainer proc env:{}".format(current_env)) @@ -99,7 +103,7 @@ def start_local_trainers(cluster, class TestMultipleGpus(unittest.TestCase): - def run_mnist_2gpu(self, target_file_name): + def run_mnist_2gpu(self, target_file_name, eager_mode=True): if fluid.core.get_cuda_device_count() == 0: return @@ -112,6 +116,7 @@ def run_mnist_2gpu(self, target_file_name): procs = start_local_trainers( cluster, pod, + eager_mode=eager_mode, training_script=target_file_name, training_script_args=[]) @@ -125,13 +130,17 @@ def run_mnist_2gpu(self, target_file_name): def test_hapi_multiple_gpus_static(self): self.run_mnist_2gpu('dist_hapi_mnist_static.py') + self.run_mnist_2gpu('dist_hapi_mnist_static.py', eager_mode=False) def test_hapi_multiple_gpus_dynamic(self): self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py') + self.run_mnist_2gpu('dist_hapi_mnist_dynamic.py', eager_mode=False) def test_hapi_amp_static(self): 
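+        # Each case now runs twice: once in eager mode and once with
+        # FLAGS_enable_eager_mode=0 exported to the spawned trainers.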
self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py') + self.run_mnist_2gpu('dist_hapi_pure_fp16_static.py', eager_mode=False) if __name__ == "__main__": + os.environ["FLAGS_enable_eager_mode"] = "1" unittest.main() From 9029fde7487f6c168e4ee22c886a3e7f2a255537 Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 13 May 2022 14:50:31 +0800 Subject: [PATCH 42/49] [IPU] fix ipu and add python infer api, test=develop (#42724) * [IPU] fix ipu and add python infer api, test=develop * [IPU] add paddlepaddle-ipu package name, test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 - paddle/fluid/pybind/inference_api.cc | 8 ++++++++ python/CMakeLists.txt | 2 ++ 3 files changed, 10 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index b430a409e9965..bfefb89ade145 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -164,7 +164,6 @@ if(WITH_IPU) pass_library(infer_shape_pass base DIR ipu) pass_library(delete_scale_op_pass base DIR ipu) pass_library(avg_shard_pass base DIR ipu) - pass_library(transfer_cast_op_pass base DIR ipu) endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 1bbe6808b2846..944781484076b 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -601,6 +601,14 @@ void BindAnalysisConfig(py::module *m) { .def("set_xpu_device_id", &AnalysisConfig::SetXpuDeviceId, py::arg("device_id") = 0) .def("enable_npu", &AnalysisConfig::EnableNpu, py::arg("device_id") = 0) + .def("enable_ipu", &AnalysisConfig::EnableIpu, + py::arg("ipu_device_num") = 1, py::arg("ipu_micro_batch_size") = 1, + py::arg("ipu_enable_pipelining") = false, + py::arg("ipu_batches_per_step") = 1) + .def("set_ipu_config", &AnalysisConfig::SetIpuConfig, + py::arg("ipu_enable_fp16") = false, py::arg("ipu_replica_num") = 1, + py::arg("ipu_available_memory_proportion") = 1.0, + py::arg("ipu_enable_half_partial") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime) .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fe5f2c25ca551..fdcd560658146 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -14,6 +14,8 @@ elseif(WITH_ASCEND_CL) SET(PACKAGE_NAME "paddlepaddle-npu") elseif(WITH_XPU) SET(PACKAGE_NAME "paddlepaddle-xpu") +elseif(WITH_IPU) + SET(PACKAGE_NAME "paddlepaddle-ipu") else() SET(PACKAGE_NAME "paddlepaddle") endif() From 73342c67b255ff4d98406fb5403529b6b6b51ff2 Mon Sep 17 00:00:00 2001 From: Chen Long <1300851984@qq.com> Date: Fri, 13 May 2022 15:17:49 +0800 Subject: [PATCH 43/49] Fix example code bugs (#42739) * update readme test=document_fix * fix api docs bugs test=document_fix --- python/paddle/nn/utils/weight_norm_hook.py | 24 ++++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index c131d218a1cde..84644ccc48445 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -213,15 +213,21 @@ def remove_weight_norm(layer, name='weight'): Examples: .. 
code-block:: python - import paddle - from paddle.nn import Conv2D - from paddle.nn.utils import weight_norm, remove_weight_norm - - conv = Conv2D(3, 5, 3) - wn = weight_norm(conv) - remove_weight_norm(conv) - print(conv.weight_g) - # AttributeError: 'Conv2D' object has no attribute 'weight_g' + import paddle + from paddle.nn import Conv2D + from paddle.nn.utils import weight_norm, remove_weight_norm + + conv = Conv2D(3, 5, 3) + wn = weight_norm(conv) + print(conv.weight_g) + # Parameter containing: + # Tensor(shape=[5], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [0., 0., 0., 0., 0.]) + # Conv2D(3, 5, kernel_size=[3, 3], data_format=NCHW) + + remove_weight_norm(conv) + # print(conv.weight_g) + # AttributeError: 'Conv2D' object has no attribute 'weight_g' """ for k, hook in layer._forward_pre_hooks.items(): if isinstance(hook, WeightNorm) and hook.name == name: From a52f8e4abc79c5332ab31933065088aa970e06df Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 13 May 2022 16:04:47 +0800 Subject: [PATCH 44/49] Fix decode_jpeg example code (#42733) * fix decode_jpeg example code * fix decode_jpeg example code --- python/paddle/vision/ops.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index e4dd4c797fef6..d45c652885b69 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -895,7 +895,10 @@ def decode_jpeg(x, mode='unchanged', name=None): Examples: .. code-block:: python + + # required: gpu import cv2 + import numpy as np import paddle fake_img = (np.random.random( From b372ab3e2dcba01a29aa5b6f21358496095953fb Mon Sep 17 00:00:00 2001 From: Walter Date: Fri, 13 May 2022 16:23:51 +0800 Subject: [PATCH 45/49] fix adaptive_avg_pool1d doc bug (#42721) * fix adaptive_avg_pool1d doc bug * fix adaptive_avg_pool1d doc bug --- python/paddle/nn/functional/pooling.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 3160f04e830d2..121028c1f0ae5 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1267,10 +1267,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): Returns: Tensor: The output tensor of adaptive average pooling result. The data type is same as input tensor. - Raises: - ValueError: 'output_size' should be an integer. Examples: .. 
code-block:: python + :name: code-example1 # average adaptive pool1d # suppose input data in shape of [N, C, L], `output_size` is m or [m], @@ -1286,10 +1285,9 @@ def adaptive_avg_pool1d(x, output_size, name=None): # import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - pool_out = F.adaptive_average_pool1d(data, output_size=16) + data = paddle.uniform([1, 3, 32]) + pool_out = F.adaptive_avg_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) """ pool_type = 'avg' From c3d50af8204db2dacdc4b462476883780baa783e Mon Sep 17 00:00:00 2001 From: Wei Shengyu Date: Fri, 13 May 2022 16:26:18 +0800 Subject: [PATCH 46/49] Fix max_pool3d doc, test=document_fix (#42715) * fix pooling doc * fix typo test=document_fix * fix doc typo, test=document_fix --- python/paddle/nn/functional/pooling.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 121028c1f0ae5..6a573005f4514 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -1160,22 +1160,21 @@ def max_pool3d(x, import paddle import paddle.nn.functional as F - import numpy as np # max pool3d - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) - output = F.max_pool2d(x, + x = paddle.uniform([1, 3, 32, 32, 32]) + output = F.max_pool3d(x, kernel_size=2, stride=2, padding=0) - output.shape [1, 3, 16, 16, 16] + # output.shape [1, 3, 16, 16, 16] # for return_mask=True - x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) + x = paddle.uniform([1, 3, 32, 32, 32]) output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, return_mask=True) - # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], + # output.shape [1, 3, 16, 16, 16], max_indices.shape [1, 3, 16, 16, 16] """ kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') if stride is None: From cc0fa79b185d0ca627c54d9222c921f1d39370b6 Mon Sep 17 00:00:00 2001 From: wuhuanzhou Date: Fri, 13 May 2022 17:02:07 +0800 Subject: [PATCH 47/49] fix sample code error of paddle.lerp, test=document_fix (#42748) --- python/paddle/tensor/math.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7cfae842337c6..d1d8581766747 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -3810,7 +3810,7 @@ def lerp(x, y, weight, name=None): x = paddle.arange(1., 5., dtype='float32') y = paddle.empty([4], dtype='float32') y.fill_(10.) - out = paddle.lerp(start, end, 0.5) + out = paddle.lerp(x, y, 0.5) # out: [5.5., 6., 6.5, 7.] 
""" From 757b5d31d1b22b04f9b64e2e5e33aa05012e1185 Mon Sep 17 00:00:00 2001 From: Ruibiao Chen Date: Fri, 13 May 2022 17:04:33 +0800 Subject: [PATCH 48/49] Refactor test_tensordot (#42650) * Refactor test_tensordot * Add test_static * Fix CI errors --- .../fluid/tests/unittests/CMakeLists.txt | 4 +- .../fluid/tests/unittests/test_tensordot.py | 222 +++++++++++------- 2 files changed, 132 insertions(+), 94 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 0b53046d056ee..b211fe37bf52b 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1036,6 +1036,7 @@ set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIME set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) @@ -1233,9 +1234,6 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) -set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) -set_tests_properties(test_tensordot PROPERTIES LABELS "RUN_TYPE=NIGHTLY") -set_tests_properties(test_tensordot PROPERTIES ENVIRONMENT "FLAGS_USE_STANDALONE_EXECUTOR=False") set_tests_properties(test_cuda_memory_reserved PROPERTIES ENVIRONMENT "FLAGS_allocator_strategy=auto_growth") if (WITH_GLOO) set_tests_properties(test_parallel_dygraph_dataparallel_cpuonly PROPERTIES TIMEOUT 30) diff --git a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py index 29f3308988f6d..9ac016511c20d 100644 --- a/python/paddle/fluid/tests/unittests/test_tensordot.py +++ b/python/paddle/fluid/tests/unittests/test_tensordot.py @@ -12,13 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import paddle -import unittest -import paddle.fluid.core as core -import numpy as np import itertools as it +import numpy as np +import unittest -np.set_printoptions(threshold=np.inf) +import paddle +import paddle.fluid.core as core def tensordot_np(x, y, axes): @@ -68,9 +67,16 @@ def tensordot_np(x, y, axes): class TestTensordotAPI(unittest.TestCase): def setUp(self): + self.set_place() self.set_dtype() self.set_input_shape() self.set_input_data() + self.set_test_axes() + + def set_place(self): + self.places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(core.CUDAPlace(0)) def set_dtype(self): self.dtype = np.float32 @@ -82,33 +88,8 @@ def set_input_shape(self): def set_input_data(self): self.x = np.random.random(self.x_shape).astype(self.dtype) self.y = np.random.random(self.y_shape).astype(self.dtype) - self.all_axes = [2] - def run_dygraph(self, place): - paddle.disable_static() - x = paddle.to_tensor(self.x, place=place) - y = paddle.to_tensor(self.y, place=place) - paddle_res = paddle.tensordot(x, y, self.axes) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) - - def run_static(self, place): - paddle.enable_static() - with paddle.static.program_guard(paddle.static.Program(), - paddle.static.Program()): - x = paddle.static.data( - name='x', shape=self.x_shape, dtype=self.dtype) - y = paddle.static.data( - name='y', shape=self.y_shape, dtype=self.dtype) - z = paddle.tensordot(x, y, self.axes) - exe = paddle.static.Executor(place) - paddle_res = exe.run(feed={'x': self.x, - 'y': self.y}, - fetch_list=[z]) - np_res = tensordot_np(self.x, self.y, self.axes) - np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) - - def test_cases(self): + def set_test_axes(self): self.all_axes = [] axial_index = range(4) all_permutations = list(it.permutations(axial_index, 0)) + list( @@ -136,57 +117,146 @@ def test_cases(self): self.all_axes.extend(range(5)) - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - + def test_dygraph(self): + paddle.disable_static() + for axes in self.all_axes: + for place in self.places: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) + + def test_static(self): + paddle.enable_static() for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) + for place in self.places: + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + x = paddle.static.data( + name='x', shape=self.x_shape, dtype=self.dtype) + y = paddle.static.data( + name='y', shape=self.y_shape, dtype=self.dtype) + z = paddle.tensordot(x, y, axes) + exe = paddle.static.Executor(place) + paddle_res = exe.run(feed={'x': self.x, + 'y': self.y}, + fetch_list=[z]) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6) class TestTensordotAPIFloat64(TestTensordotAPI): + # Only test a small part of axes case for Float64 type + def set_test_axes(self): + self.all_axes = [ + [[3, 2], [3]], [[2, 1, 0], [2, 1]], [[1, 2, 0], [1, 3, 2]], [3, 0], + [[], [0, 3, 1]], [[2, 1, 0, 3], [2, 0, 1, 3]], + [[3, 1, 2], [1, 3, 2, 0]], [[2, 1], [0, 2]], [[2, 0, 1, 3], [2]], + [[1, 2, 0, 3], [0, 2, 1]], [[2, 1, 3, 0], [1, 2, 3]], + [[2, 0, 1, 3], [3, 1, 0, 2]], [[0, 
3], [0, 3, 2, 1]], + [[1, 3, 2, 0], [2, 1, 0, 3]], [[1, 3, 2, 0], [1, 3, 2, 0]], + [[1, 0, 2], [0, 1]], [[2, 3, 0], [3, 1]], + [[1, 3, 2, 0], [3, 0, 1, 2]], [[3, 2, 1], [2, 0, 1]], [[0], []], + [[2, 3, 0], [1, 2, 0]], [[3, 0, 2, 1], [2, 1, 0, 3]], + [[3, 1, 2], [2, 3, 1]], [[1, 0, 2, 3], []], [[1, 2], [1, 2, 3]], + [[2, 0, 1, 3], [2, 0, 1]], [[3, 1, 2], [1, 3, 2]], + [[3, 1, 2, 0], [1, 2, 3, 0]], [[0, 2, 3], [0, 1, 2]], + [[3, 2, 0], [2, 0, 3, 1]], [[2, 1, 0, 3], [3, 1, 2, 0]], + [[1, 2, 3, 0], [1, 3, 0, 2]], [[3, 0], [2, 1]], + [[0, 1, 3, 2], [0, 2, 1, 3]], [[1, 0], [2, 1, 3]], + [[1, 0, 3, 2], [2, 3, 0, 1]], [[1, 2], [3]], + [[1, 2, 3, 0], [3, 2, 1, 0]], [[0, 3, 2, 1], [2, 1, 3, 0]], [0], + [[0, 2, 3], [3, 2, 0, 1]], [[1, 2, 3, 0], [3, 2, 1, 0]], + [[3, 1], [3]], [[3, 2, 0, 1], [3, 2, 0]], [[2, 3, 0, 1], [0, 3, 2]], + [[1], [1, 3]], [[1, 2], [2, 1, 0]], [[3, 1, 2], [3, 1, 0]], + [[1, 3], [3, 1, 2]], [[2, 0, 1, 3], [3, 1, 0, 2]], + [[1, 3, 0], [1, 3]], [[2, 3, 1], [1, 0, 2]], + [[1, 2, 0, 3], [0, 2, 1, 3]], [[2], [0, 1, 3]], [[1], [1, 2]], + [[1, 0, 2, 3], [3, 0, 1, 2]], [[0, 1, 3, 2], [1, 3, 0, 2]], + [[3, 0, 2, 1], [0, 2, 3]], [[1, 2, 0], [1, 2, 3]], + [[1, 0, 3], [2, 3, 0]], [[2, 3, 0], [3, 1, 0]], [[1, 3], [1, 0]], + [[2, 1, 0, 3], [2, 0, 3, 1]], [[3, 2, 0], [2, 1, 0]], + [[0, 1, 3], [0, 3, 1]], [[3, 1, 0], [3, 2, 1]], [[3, 2], [3, 1]], + [[3], [2, 1, 0]], [[1, 2, 3, 0], []], [[1, 3, 2, 0], [3, 1, 2]], + [[1], [0, 2]], [[3, 2, 0], [3, 2, 0]], [[3], []], + [[1, 0, 3], [2, 1]], [[3, 1, 0, 2], [2, 3, 1, 0]], + [[0, 1], [0, 3, 2]], [[0, 2, 3], [0, 2, 1]], [[1, 3, 0], [3, 0, 2]], + [[3, 1, 2], [1, 2, 3]], [[3, 1, 2], [3, 1, 0]], + [[0, 3, 1, 2], [3, 2, 1, 0]], [[0, 3], [3, 2, 1]], + [[2, 3], [1, 3, 0]], [[0, 3, 2], [2, 0, 3, 1]], [[2, 3], [1, 3]], + [[3, 1, 2, 0], [2, 3, 1, 0]], [[1, 0, 3, 2], [3, 0, 1, 2]], + [[3, 2, 1, 0], [0, 1, 3, 2]], [[3, 1, 2], [3]], + [[0, 1, 3, 2], [2, 3, 0, 1]], [[1, 2, 3, 0], [1, 3, 0, 2]], + [3, 1, 2], [[3, 1, 2], [0, 3, 2]], [[2, 3, 0], [1, 2, 0]], + [[2, 0, 3], [2, 0]], [[3, 1, 0, 2], [3, 1, 0, 2]], + [[0, 1, 2], [2, 0, 1]], [[1, 0, 3], [2, 3, 0]], + [[2, 0, 1], [0, 1, 3]], [[2, 1], [0, 1, 3]] + ] + def set_dtype(self): self.dtype = np.float64 +class TestTensordotAPIBroadcastCase1(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 1, 1, 5] + self.y_shape = [1, 5, 1, 1] + + +class TestTensordotAPIBroadcastCase2(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 5, 5, 5] + self.y_shape = [1, 1, 1, 5] + + +class TestTensordotAPIBroadcastCase3(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [5, 5, 1, 5] + + +class TestTensordotAPIBroadcastCase4(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [1, 1, 1, 1] + + +class TestTensordotAPIBroadcastCase5(TestTensordotAPIFloat64): + def set_input_shape(self): + self.x_shape = [1, 1, 5, 5] + self.y_shape = [5, 5, 1, 5] + + class TestTensordotAPIAxesType(TestTensordotAPI): def set_input_shape(self): self.x_shape = [3, 4, 4] self.y_shape = [4, 4, 5] - def test_cases(self): + def set_test_axes(self): self.all_axes = [ 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), [[1, 2], [0, 1]] ] - places = [core.CPUPlace()] - if core.is_compiled_with_cuda(): - places.append(core.CUDAPlace(0)) - - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) - self.run_static(place) - + 
def test_tensor_axes(self): # The 'axes' with type 'Tensor' in tensordot is not available in static mode paddle.disable_static() - for place in places: - self.all_axes = [ - paddle.to_tensor([1]), (paddle.to_tensor([1])), - (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), - [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], - paddle.to_tensor([[1, 2], [0, 1]]) - ] - for axes in self.all_axes: - self.axes = axes - for place in places: - self.run_dygraph(place) + tensor_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + + for place in self.places: + for axes in tensor_axes: + x = paddle.to_tensor(self.x, place=place) + y = paddle.to_tensor(self.y, place=place) + paddle_res = paddle.tensordot(x, y, axes) + np_res = tensordot_np(self.x, self.y, axes) + np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6) def test_error(self): self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], @@ -204,35 +274,5 @@ def set_dtype(self): self.dtype = np.float64 -class TestTensordotAPIBroadcastCase1(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 1, 1, 5] - self.y_shape = [1, 5, 1, 1] - - -class TestTensordotAPIBroadcastCase2(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 5, 5, 5] - self.y_shape = [1, 1, 1, 5] - - -class TestTensordotAPIBroadcastCase3(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [5, 5, 5, 1] - self.y_shape = [5, 5, 1, 5] - - -class TestTensordotAPIBroadcastCase4(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [5, 5, 5, 1] - self.y_shape = [1, 1, 1, 1] - - -class TestTensordotAPIBroadcastCase5(TestTensordotAPI): - def set_input_shape(self): - self.x_shape = [1, 1, 5, 5] - self.y_shape = [5, 5, 1, 5] - - if __name__ == "__main__": unittest.main() From 1280f2947d7920676267df3f3ed5354d05edfdae Mon Sep 17 00:00:00 2001 From: Wilber Date: Fri, 13 May 2022 17:05:20 +0800 Subject: [PATCH 49/49] add gpu resources. 
(#42723) --- paddle/fluid/inference/api/CMakeLists.txt | 4 +- paddle/fluid/inference/api/infer_context.cc | 17 + paddle/fluid/inference/api/infer_context.h | 46 +++ .../fluid/inference/api/resource_manager.cc | 290 +++++++++++++++ paddle/fluid/inference/api/resource_manager.h | 109 ++++++ paddle/phi/backends/gpu/CMakeLists.txt | 3 +- paddle/phi/backends/gpu/gpu_context.cc | 343 +++++------------- paddle/phi/backends/gpu/gpu_context.h | 4 + paddle/phi/backends/gpu/gpu_resources.cc | 271 ++++++++++++++ paddle/phi/backends/gpu/gpu_resources.h | 51 +++ 10 files changed, 876 insertions(+), 262 deletions(-) create mode 100644 paddle/fluid/inference/api/infer_context.cc create mode 100644 paddle/fluid/inference/api/infer_context.h create mode 100644 paddle/fluid/inference/api/resource_manager.cc create mode 100644 paddle/fluid/inference/api/resource_manager.h create mode 100644 paddle/phi/backends/gpu/gpu_resources.cc create mode 100644 paddle/phi/backends/gpu/gpu_resources.h diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index edec1b1c7d0e4..56cc4aa755bda 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -50,10 +50,10 @@ if(WITH_GPU AND TENSORRT_FOUND) endif() if (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc onnxruntime_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils onnxruntime paddle2onnx) else (WITH_ONNXRUNTIME) - cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} + cc_library(analysis_predictor SRCS analysis_predictor.cc resource_manager.cc infer_context.cc ${mkldnn_quantizer_src} DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils) endif (WITH_ONNXRUNTIME) diff --git a/paddle/fluid/inference/api/infer_context.cc b/paddle/fluid/inference/api/infer_context.cc new file mode 100644 index 0000000000000..7706f2d0824e3 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.cc @@ -0,0 +1,17 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/api/infer_context.h" + +namespace paddle {} // namespace paddle diff --git a/paddle/fluid/inference/api/infer_context.h b/paddle/fluid/inference/api/infer_context.h new file mode 100644 index 0000000000000..b7a8bf637d872 --- /dev/null +++ b/paddle/fluid/inference/api/infer_context.h @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/phi/backends/all_context.h" + +namespace paddle { + +class InferCPUContext : public phi::CPUContext { + public: + using phi::CPUContext::SetEigenDevice; +}; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class InferGPUContext : public phi::GPUContext { + public: + using phi::GPUContext::SetStream; + using phi::GPUContext::SetEigenDevice; + using phi::GPUContext::SetBlasHandle; + using phi::GPUContext::SetBlasTensorCoreHandle; + using phi::GPUContext::SetBlasTF32Handle; + using phi::GPUContext::SetDnnHandle; + using phi::GPUContext::SetSolverHandle; + using phi::GPUContext::SetSparseHandle; + // using phi::GPUContext::SetDnnWorkspaceHandle; + using phi::GPUContext::SetComputeCapability; + using phi::GPUContext::SetMaxThreadsPerMultiProcessor; + using phi::GPUContext::SetMultiProcessors; + using phi::GPUContext::SetMaxThreadsPerBlock; + using phi::GPUContext::SetMaxGridDimSize; + using phi::GPUContext::SetDriverVersion; + using phi::GPUContext::SetRuntimeVersion; +}; +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.cc b/paddle/fluid/inference/api/resource_manager.cc new file mode 100644 index 0000000000000..d88f282ce7a62 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
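+//
+// ResourceManager (declared in resource_manager.h, whose diff follows) owns
+// the per-predictor GPU state: the CUDA/HIP stream itself when no external
+// stream is supplied, plus the cuBLAS/cuBLASLt, cuDNN, cuSOLVER and cuSPARSE
+// handles and an Eigen GpuDevice bound to that stream.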
+ +#include "paddle/fluid/inference/api/resource_manager.h" + +#include + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/generator.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace internal { + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +class EigenGpuStreamDevice : public Eigen::StreamInterface { + public: + EigenGpuStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { + Eigen::initializeDeviceProp(); + } + ~EigenGpuStreamDevice() override {} + + void Reinitialize(gpuStream_t cuda_stream, phi::Allocator* allocator, + GPUPlace place) { + stream_ = cuda_stream; + allocator_ = allocator; + device_prop_ = &Eigen::m_deviceProperties[place.device]; + } + + const gpuStream_t& stream() const override { return stream_; } + + const gpuDeviceProp& deviceProperties() const override { + return *device_prop_; + } + + void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } + auto buf = allocator_->Allocate(num_bytes); + VLOG(4) << "Eigen allocated at " << buf->ptr() << " requested " + << num_bytes; + void* retv = buf->ptr(); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } + return retv; + } + + void deallocate(void* buffer) const override { + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } + } + + void* scratchpad() const override { + if (scratch_ == NULL) { + scratch_ = allocate(Eigen::kGpuScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + unsigned int* semaphore() const override { + if (semaphore_ == NULL) { + char* scratch = static_cast(scratchpad()) + Eigen::kGpuScratchSize; + semaphore_ = reinterpret_cast(scratch); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), stream_)); +#endif + } + return semaphore_; + } + + private: + gpuStream_t stream_; // not owned; + phi::Allocator* allocator_; // not owned; + const gpuDeviceProp* device_prop_; // not owned; + mutable void* scratch_; + mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ + mutable std::unordered_map allocations_; +}; +#endif +} // namespace internal + +ResourceManager::ResourceManager(const phi::Place& place, void* stream) + : place_(place) { + InitCPUResource(); + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + InitGPUResource(stream); +#endif +} + +ResourceManager::~ResourceManager() { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + DestroyGPUResource(); +#endif +} + +void ResourceManager::InitCPUResource() { + cpu_eigen_device_.reset(new Eigen::DefaultDevice()); +} + +Eigen::DefaultDevice* ResourceManager::GetCpuEigenDevice() { + return cpu_eigen_device_.get(); +} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +void ResourceManager::InitGPUResource(void* stream) { + if (stream == nullptr) { + owned_stream_ = true; + phi::InitStream(&stream_); + } else { + owned_stream_ = false; + stream_ = reinterpret_cast(stream); + } + + InitGpuProperties(); + InitGpuEigenDevice(); + InitDnnHanlde(); + InitBlasHandle(); + 
InitBlasLtHandle(); + InitSolverHandle(); + InitSparseHandle(); +} + +void ResourceManager::DestroyGPUResource() { + if (owned_stream_) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_)); +#endif + stream_ = nullptr; + } + + DestroyDnnHandle(); + DestroyBlasHandle(); + DestroyBlasLtHandle(); + DestroySolverHandle(); + DestroySparseHandle(); +} + +void ResourceManager::InitGpuProperties() { + phi::backends::gpu::GPUDeviceGuard guard(place_.device); + phi::InitGpuProperties(place_, &compute_capability_, &runtime_version_, + &driver_version_, &multi_process_, + &max_threads_per_mp_, &max_threads_per_block_, + &max_grid_dim_size_); +} + +void ResourceManager::InitGpuEigenDevice() { + auto* allocator = paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place_) + .get(); + eigen_stream_.reset(new internal::EigenGpuStreamDevice()); + eigen_stream_->Reinitialize(stream_, allocator, place_); + gpu_eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); +} + +void ResourceManager::InitDnnHanlde() { + phi::InitDnnHandle(&dnn_handle_, stream_, place_); +} + +void ResourceManager::DestroyDnnHandle() { phi::DestroyDnnHandle(dnn_handle_); } + +void ResourceManager::InitBlasHandle() { + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif +} + +void ResourceManager::DestroyBlasHandle() { + phi::DestroyBlasHandle(blas_handle_); + phi::DestroyBlasHandle(blas_tensor_core_handle_); + phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_); +} + +void ResourceManager::InitBlasLtHandle() { + phi::InitBlasLtHandle(&blaslt_handle_); +} + +void ResourceManager::DestroyBlasLtHandle() { + phi::DestroyBlasLtHandle(blaslt_handle_); +} + +void ResourceManager::InitSolverHandle() { + phi::InitSolverHandle(&solver_handle_, stream_); +} + +void ResourceManager::DestroySolverHandle() { + phi::DestroySolverHandle(solver_handle_); +} + +void ResourceManager::InitSparseHandle() { + phi::InitSparseHandle(&sparse_handle_, stream_); +} + +void ResourceManager::DestroySparseHandle() { + phi::DestroySparseHandle(sparse_handle_); +} + +gpuStream_t ResourceManager::GetStream() const { return stream_; } + +dnnHandle_t ResourceManager::GetDnnHandle() const { return dnn_handle_; } + +blasHandle_t ResourceManager::GetBlasHandle() const { return blas_handle_; } + +blasHandle_t ResourceManager::GetBlasTensorCoreHandle() const { + return blas_tensor_core_handle_; +} + +blasHandle_t ResourceManager::GetBlasTF32Handle() const { + return blas_tf32_tensor_core_handle_; +} + +blasLtHandle_t ResourceManager::GetBlasLtHandle() const { + return blaslt_handle_; +} + +phi::solverHandle_t ResourceManager::GetSolverDnHandle() const { + return solver_handle_; +} + +phi::sparseHandle_t ResourceManager::GetSparseHandle() const { + return sparse_handle_; +} + +Eigen::GpuDevice* ResourceManager::GetGpuEigenDevice() const { + return gpu_eigen_device_.get(); +} + +int ResourceManager::GetGpuComputeCapability() const { + return compute_capability_; +} + +int 
ResourceManager::GetGpuRuntimeVersion() const { return runtime_version_; } + +int ResourceManager::GetGpuDriverVersion() const { return driver_version_; } + +int ResourceManager::GetGPUMultiProcessors() const { return multi_process_; } + +int ResourceManager::GetGpuMaxThreadsPerMp() const { + return max_threads_per_mp_; +} + +int ResourceManager::GetGpuMaxThreadsPerBlock() const { + return max_threads_per_block_; +} + +std::array ResourceManager::GetGpuMaxGridDimSize() const { + return max_grid_dim_size_; +} + +#endif +} // namespace paddle diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h new file mode 100644 index 0000000000000..c41968dc58590 --- /dev/null +++ b/paddle/fluid/inference/api/resource_manager.h @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/cpu/forwards.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/fluid/platform/device/gpu/gpu_types.h" +#include "paddle/phi/backends/gpu/forwards.h" +#include "paddle/phi/backends/gpu/gpu_decls.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" +#endif + +namespace paddle { +namespace internal { +class EigenGpuStreamDevice; +} // namespace internal + +class ResourceManager { + public: + explicit ResourceManager(const phi::Place& place, void* stream); + ~ResourceManager(); + + public: + Eigen::DefaultDevice* GetCpuEigenDevice(); + + private: + void InitCPUResource(); + + private: + phi::Place place_; + std::unique_ptr cpu_eigen_device_; + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + + public: + gpuStream_t GetStream() const; + dnnHandle_t GetDnnHandle() const; + blasHandle_t GetBlasHandle() const; + blasHandle_t GetBlasTensorCoreHandle() const; + blasHandle_t GetBlasTF32Handle() const; + blasLtHandle_t GetBlasLtHandle() const; + phi::solverHandle_t GetSolverDnHandle() const; + phi::sparseHandle_t GetSparseHandle() const; + Eigen::GpuDevice* GetGpuEigenDevice() const; + int GetGpuComputeCapability() const; + int GetGpuRuntimeVersion() const; + int GetGpuDriverVersion() const; + int GetGPUMultiProcessors() const; + int GetGpuMaxThreadsPerMp() const; + int GetGpuMaxThreadsPerBlock() const; + std::array GetGpuMaxGridDimSize() const; + + private: + void InitGPUResource(void* stream); + void DestroyGPUResource(); + void InitGpuProperties(); + void InitGpuEigenDevice(); + void InitDnnHanlde(); + void DestroyDnnHandle(); + void InitBlasHandle(); + void DestroyBlasHandle(); + void InitBlasLtHandle(); + void DestroyBlasLtHandle(); + void InitSolverHandle(); + void DestroySolverHandle(); + void InitSparseHandle(); + void DestroySparseHandle(); + + private: + int compute_capability_; + int runtime_version_; + int driver_version_; + int multi_process_; + int max_threads_per_mp_; + int max_threads_per_block_; + std::array max_grid_dim_size_; + + 
bool owned_stream_{true}; + gpuStream_t stream_; + std::unique_ptr gpu_eigen_device_; + std::unique_ptr eigen_stream_; + + blasHandle_t blas_handle_{nullptr}; + blasHandle_t blas_tensor_core_handle_{nullptr}; + blasHandle_t blas_tf32_tensor_core_handle_{nullptr}; + blasLtHandle_t blaslt_handle_{nullptr}; + dnnHandle_t dnn_handle_{nullptr}; + phi::solverHandle_t solver_handle_{nullptr}; + phi::sparseHandle_t sparse_handle_{nullptr}; +// DnnWorkspaceHandle +#endif +}; + +} // namespace paddle diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index d14e94024f90f..ebe8f1ca4c101 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -6,4 +6,5 @@ elseif(WITH_ROCM) hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) +cc_library(gpu_resources SRCS gpu_resources.cc DEPS phi_device_context phi_gpu_info) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3 gpu_resources) diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index ff238b7997865..e5d34376834dd 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_decls.h" #include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/gpu/gpu_resources.h" #include "paddle/phi/common/float16.h" #include "paddle/phi/common/place.h" #include "paddle/phi/core/allocator.h" @@ -202,27 +203,65 @@ struct GPUContext::Impl { void Init() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); InitEigenDevice(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH)); +#endif +#if CUDA_VERSION >= 11000 + phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode( + blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH)); +#endif +#endif + phi::InitBlasLtHandle(&blaslt_handle_); + phi::InitDnnHandle(&dnn_handle_, stream_, place_); + phi::InitSolverHandle(&solver_handle_, stream_); + phi::InitSparseHandle(&sparse_handle_, stream_); InitDnnWorkspace(); } void PartialInitWithoutAllocator() { owned_ = true; backends::gpu::GPUDeviceGuard guard(place_.device); - InitGpuProperties(); - InitStream(); - InitBlasHandle(); - InitBlasLtHandle(); - InitDNNHandle(); - InitSolverHandle(); - InitSparseHandle(); + phi::InitGpuProperties(place_, + &compute_capability_, + &runtime_version_, + &driver_version_, + &multi_process_, + &max_threads_per_mp_, + &max_threads_per_block_, + &max_grid_dim_size_); + phi::InitStream(&stream_); + phi::InitBlasHandle(&blas_handle_, stream_); +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 9000 + phi::InitBlasHandle(&blas_tensor_core_handle_, stream_); + 
+    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
+#endif
+#if CUDA_VERSION >= 11000
+    phi::InitBlasHandle(&blas_tf32_tensor_core_handle_, stream_);
+    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
+        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
+#endif
+#endif
+    phi::InitBlasLtHandle(&blaslt_handle_);
+    phi::InitDnnHandle(&dnn_handle_, stream_, place_);
+    phi::InitSolverHandle(&solver_handle_, stream_);
+    phi::InitSparseHandle(&sparse_handle_, stream_);
   }
 
   void PartialInitWithAllocator() {
@@ -238,19 +277,23 @@ struct GPUContext::Impl {
   ~Impl() {
     backends::gpu::GPUDeviceGuard guard(place_.device);
-    DestoryInternalWorkspace();
-    DestoryInternalEigenDevice();
-    DestroyInternalSparseHandle();
-    DestroyInternalSolverHandle();
-    DestroyInternalDnnHandle();
+    if (owned_) {
+      DestoryInternalWorkspace();
+      DestoryInternalEigenDevice();
+      phi::DestroySparseHandle(sparse_handle_);
+      phi::DestroySolverHandle(solver_handle_);
+      phi::DestroyDnnHandle(dnn_handle_);
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-    if (nccl_comm_) {
-      PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
-    }
+      if (nccl_comm_) {
+        PADDLE_ENFORCE_GPU_SUCCESS(dynload::ncclCommDestroy(nccl_comm_));
+      }
 #endif
-    DestroyInternalBlasHandle();
-    DestroyInternalBlasLtHandle();
-    DestoryInternalStream();
+      phi::DestroyBlasHandle(blas_handle_);
+      phi::DestroyBlasHandle(blas_tensor_core_handle_);
+      phi::DestroyBlasHandle(blas_tf32_tensor_core_handle_);
+      phi::DestroyBlasLtHandle(blaslt_handle_);
+      phi::DestoryStream(stream_);
+    }
   }
 
   const Place& GetPlace() const { return place_; }
 
@@ -259,73 +302,6 @@ struct GPUContext::Impl {
     return blas_tensor_core_handle_ != nullptr;
   }
 
-  void InitGpuProperties() {
-    backends::gpu::GPUDeviceGuard guard(place_.GetDeviceId());
-    compute_capability_ =
-        backends::gpu::GetGPUComputeCapability(place_.GetDeviceId());
-    multi_process_ = backends::gpu::GetGPUMultiProcessors(place_.GetDeviceId());
-    max_threads_per_mp_ =
-        backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place_.GetDeviceId());
-    max_grid_dim_size_ =
-        backends::gpu::GetGpuMaxGridDimSize(place_.GetDeviceId());
-    max_threads_per_block_ =
-        backends::gpu::GetGPUMaxThreadsPerBlock(place_.GetDeviceId());
-    driver_version_ = backends::gpu::GetGPUDriverVersion(place_.GetDeviceId());
-    runtime_version_ =
-        backends::gpu::GetGPURuntimeVersion(place_.GetDeviceId());
-
-    // TODO(wilber): glog may be replaced in the future?
-    LOG_FIRST_N(WARNING, 1)
-        << "Please NOTE: device: " << static_cast<int>(place_.device)
-        << ", GPU Compute Capability: " << compute_capability_ / 10 << "."
-        << compute_capability_ % 10
-        << ", Driver API Version: " << driver_version_ / 1000 << "."
-        << (driver_version_ % 100) / 10
-        << ", Runtime API Version: " << runtime_version_ / 1000 << "."
-        << (runtime_version_ % 100) / 10;
-#ifdef PADDLE_WITH_HIP
-    size_t miopen_major, miopen_minor, miopen_patch;
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch));
-    auto cudnn_dso_ver =
-        (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
-    auto compile_miopen_version = MIOPEN_VERSION / 10;
-    if (cudnn_dso_ver < static_cast<size_t>(compile_miopen_version)) {
-      LOG_FIRST_N(WARNING, 1)
-          << "WARNING: device: " << static_cast<int>(place_.device)
-          << ". The installed Paddle is compiled with MIOPEN "
-          << compile_miopen_version / 100 << "."
-          << compile_miopen_version % 100
-          << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100
-          << "." << cudnn_dso_ver % 100
-          << ", which may cause serious incompatible bug. "
-          << "Please recompile or reinstall Paddle with compatible MIOPEN "
-             "version.";
-    }
-#else
-    size_t cudnn_dso_ver = dynload::cudnnGetVersion();
-    LOG_FIRST_N(WARNING, 1) << "device: " << static_cast<int>(place_.device)
-                            << ", cuDNN Version: " << cudnn_dso_ver / 1000
-                            << "." << (cudnn_dso_ver % 1000) / 100 << ".";
-
-    // Check CUDA/CUDNN version compatiblity
-    auto local_cuda_version =
-        (driver_version_ / 1000) * 10 + (driver_version_ % 100) / 10;
-    auto compile_cuda_version =
-        (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10;
-    if (local_cuda_version < compile_cuda_version) {
-      LOG_FIRST_N(WARNING, 1)
-          << "WARNING: device: " << static_cast<int>(place_.device)
-          << ". The installed Paddle is compiled with CUDA "
-          << compile_cuda_version / 10 << "." << compile_cuda_version % 10
-          << ", but CUDA runtime version in your machine is "
-          << local_cuda_version / 10 << "." << local_cuda_version % 10
-          << ", which may cause serious incompatible bug. "
-          << "Please recompile or reinstall Paddle with compatible CUDA "
-             "version.";
-    }
-#endif
-  }
-
   void InitDnnWorkspace() {
     PD_CHECK(allocator_ != nullptr,
              "the device allocator for gpu context is nullptr.");
@@ -350,27 +326,6 @@ struct GPUContext::Impl {
     return DnnWorkspaceHandle(allocator_, stream_);
   }
 
-  void InitStream() {
-#ifdef PADDLE_WITH_HIP
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        hipStreamCreateWithPriority(&stream_, hipStreamDefault, 0));
-#else
-    PADDLE_ENFORCE_GPU_SUCCESS(
-        cudaStreamCreateWithPriority(&stream_, cudaStreamDefault, 0));
-#endif
-  }
-
-  void DestoryInternalStream() {
-    if (owned_ && stream_ != nullptr) {
-#ifdef PADDLE_WITH_HIP
-      PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream_));
-#else
-      PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream_));
-#endif
-    }
-    stream_ = nullptr;
-  }
-
   void SetStream(gpuStream_t stream) { stream_ = stream; }
 
   gpuStream_t GetStream() const {
@@ -400,55 +355,6 @@ struct GPUContext::Impl {
     return eigen_device_;
   }
 
-  void InitBlasHandle() {
-#ifdef PADDLE_WITH_HIP
-    phi::dynload::rocblas_create_handle(&blas_handle_);
-    phi::dynload::rocblas_set_stream(blas_handle_, stream_);
-#else   // PADDLE_WITH_CUDA
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(&blas_handle_));
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_handle_, stream_));
-#if CUDA_VERSION >= 9000
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasCreate(&blas_tensor_core_handle_));
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_tensor_core_handle_, stream_));
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tensor_core_handle_, CUBLAS_TENSOR_OP_MATH));
-#if CUDA_VERSION >= 11000
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasCreate(&blas_tf32_tensor_core_handle_));
-    PADDLE_RETRY_CUDA_SUCCESS(
-        phi::dynload::cublasSetStream(blas_tf32_tensor_core_handle_, stream_));
-    PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasSetMathMode(
-        blas_tf32_tensor_core_handle_, CUBLAS_TF32_TENSOR_OP_MATH));
-#endif  // CUDA_VERSION >= 11000
-#endif  // CUDA_VERSION >= 9000
-#endif  // PADDLE_WITH_HIP
-  }
-
-  void DestroyInternalBlasHandle() {
-#ifdef PADDLE_WITH_HIP
-    if (owned_ && blas_handle_ != nullptr) {
-      phi::dynload::rocblas_destroy_handle(blas_handle_);
-      blas_handle_ = nullptr;
-    }
-#else
-    if (owned_ && blas_handle_ != nullptr) {
-      phi::dynload::cublasDestroy(blas_handle_);
-      blas_handle_ = nullptr;
-    }
-    if (owned_ && blas_tensor_core_handle_ != nullptr) {
-      phi::dynload::cublasDestroy(blas_tensor_core_handle_);
-      blas_tensor_core_handle_ = nullptr;
-    }
-    if (owned_ && blas_tf32_tensor_core_handle_ != nullptr) {
-      phi::dynload::cublasDestroy(blas_tf32_tensor_core_handle_);
-      blas_tf32_tensor_core_handle_ = nullptr;
-    }
-#endif  // PADDLE_WITH_HIP
-  }
-
   blasHandle_t GetBlasHandle() const {
     PD_CHECK(blas_handle_ != nullptr, "the gpu blas handle is nullptr.");
     return blas_handle_;
@@ -456,16 +362,12 @@ struct GPUContext::Impl {
 
   void SetBlasHandle(blasHandle_t blas) { blas_handle_ = blas; }
 
-  void InitBlasLtHandle() {
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
-    phi::dynload::cublasLtCreate(&blaslt_handle_);
-#endif
+  void SetBlasTensorCoreHandle(blasHandle_t handle) {
+    blas_tensor_core_handle_ = handle;
   }
 
-  void DestroyInternalBlasLtHandle() {
-#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060
-    phi::dynload::cublasLtDestroy(blaslt_handle_);
-#endif
+  void SetBlasTF32Handle(blasHandle_t handle) {
+    blas_tf32_tensor_core_handle_ = handle;
   }
 
   void SetBlasLtHandle(blasLtHandle_t blaslt) { blaslt_handle_ = blaslt; }
@@ -475,53 +377,6 @@ struct GPUContext::Impl {
     return blaslt_handle_;
   }
 
-  void InitDNNHandle() {
-    if (phi::dynload::HasCUDNN()) {
-#ifdef PADDLE_WITH_HIP
-      size_t miopen_major, miopen_minor, miopen_patch;
-      PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenGetVersion(
-          &miopen_major, &miopen_minor, &miopen_patch));
-      auto local_miopen_version =
-          (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
-      auto compile_miopen_version = MIOPEN_VERSION / 10;
-      if (local_miopen_version < static_cast<size_t>(compile_miopen_version)) {
-        LOG_FIRST_N(WARNING, 1)
-            << "WARNING: device: " << place_.device
-            << ". The installed Paddle is compiled with MIOPEN "
-            << compile_miopen_version / 100 << "."
-            << compile_miopen_version % 100
-            << ", but MIOPEN version in your machine is "
-            << local_miopen_version / 100 << "." << local_miopen_version % 100
-            << ", which may cause serious incompatible bug. "
-            << "Please recompile or reinstall Paddle with compatible MIOPEN "
-               "version.";
-      }
-      PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(&dnn_handle_));
-      PADDLE_ENFORCE_GPU_SUCCESS(
-          dynload::miopenSetStream(dnn_handle_, stream_));
-#else
-      auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100;
-      auto compile_cudnn_version = CUDNN_VERSION / 100;
-      if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
-        LOG_FIRST_N(WARNING, 1)
-            << "WARNING: device: " << place_.device
-            << ". The installed Paddle is compiled with CUDNN "
-            << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10
-            << ", but CUDNN version in your machine is "
-            << local_cudnn_version / 10 << "." << local_cudnn_version % 10
-            << ", which may cause serious incompatible bug. "
" - << "Please recompile or reinstall Paddle with compatible CUDNN " - "version."; - } - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(&dnn_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cudnnSetStream(dnn_handle_, stream_)); -#endif - } else { - dnn_handle_ = nullptr; - } - } - dnnHandle_t GetDnnHandle() { PD_CHECK(dnn_handle_ != nullptr, "the gpu dnn handle is nullptr."); return dnn_handle_; @@ -543,24 +398,6 @@ struct GPUContext::Impl { void SetDnnHandle(dnnHandle_t handle) { dnn_handle_ = handle; } - void InitSolverHandle() { -#ifndef PADDLE_WITH_HIP - PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(&solver_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - phi::dynload::cusolverDnSetStream(solver_handle_, stream_)); -#endif - } - - void DestroyInternalSolverHandle() { -#ifndef PADDLE_WITH_HIP - if (owned_ && solver_handle_ != nullptr) { - PADDLE_ENFORCE_GPU_SUCCESS( - phi::dynload::cusolverDnDestroy(solver_handle_)); - solver_handle_ = nullptr; - } -#endif - } - solverHandle_t GetSolverHandle() const { PD_CHECK(solver_handle_ != nullptr, "the gpu solver handle is nullptr."); return solver_handle_; @@ -568,29 +405,6 @@ struct GPUContext::Impl { void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; } - void InitSparseHandle() { -// ROCM is not yet supported -#if defined(PADDLE_WITH_CUDA) -// The generic APIs is supported from CUDA10.1 -#if CUDA_VERSION >= 10010 - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&sparse_handle_)); - PADDLE_RETRY_CUDA_SUCCESS( - dynload::cusparseSetStream(sparse_handle_, stream_)); -#endif -#endif - } - - void DestroyInternalSparseHandle() { -#ifdef PADDLE_WITH_CUDA -#if CUDA_VERSION >= 10010 - if (owned_ && sparse_handle_ != nullptr) { - PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(sparse_handle_)); - sparse_handle_ = nullptr; - } -#endif -#endif - } - sparseHandle_t GetSparseHandle() const { PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr."); return sparse_handle_; @@ -878,7 +692,10 @@ void GPUContext::Init() { impl_->Init(); } -void GPUContext::SetStream(gpuStream_t stream) { impl_->SetStream(stream); } +void GPUContext::SetStream(gpuStream_t stream) { + impl_->allocator_ = const_cast(&this->GetAllocator()); + impl_->SetStream(stream); +} void GPUContext::SetEigenDevice(Eigen::GpuDevice* device) { impl_->SetEigenDevice(device); @@ -888,6 +705,14 @@ void GPUContext::SetBlasHandle(blasHandle_t blas) { impl_->SetBlasHandle(blas); } +void GPUContext::SetBlasTensorCoreHandle(blasHandle_t handle) { + impl_->SetBlasTensorCoreHandle(handle); +} + +void GPUContext::SetBlasTF32Handle(blasHandle_t handle) { + impl_->SetBlasTF32Handle(handle); +} + void GPUContext::SetBlasLtHandle(blasLtHandle_t blaslt) { impl_->SetBlasLtHandle(blaslt); } diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 8d44acaa4a083..db9f287041dfb 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -199,6 +199,10 @@ class PADDLE_API GPUContext : public DeviceContext { void SetBlasHandle(blasHandle_t); + void SetBlasTensorCoreHandle(blasHandle_t); + + void SetBlasTF32Handle(blasHandle_t); + void SetBlasLtHandle(blasLtHandle_t); void SetDnnHandle(dnnHandle_t); diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc new file mode 100644 index 0000000000000..268024eb25949 --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -0,0 +1,271 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc
new file mode 100644
index 0000000000000..268024eb25949
--- /dev/null
+++ b/paddle/phi/backends/gpu/gpu_resources.cc
@@ -0,0 +1,271 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/gpu/gpu_resources.h"
+
+#include "paddle/phi/api/include/tensor.h"
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/common/place.h"
+#include "paddle/phi/core/allocator.h"
+
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/phi/backends/dynload/cublas.h"
+#include "paddle/phi/backends/dynload/cudnn.h"
+#include "paddle/phi/backends/dynload/cusolver.h"
+#include "paddle/phi/backends/dynload/cusparse.h"
+#if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
+#include "paddle/phi/backends/dynload/nccl.h"
+#endif  // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
+#endif  // PADDLE_WITH_CUDA
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+// TODO(phi): remove fluid header.
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+
+void InitGpuProperties(Place place,
+                       int* compute_capability,
+                       int* runtime_version,
+                       int* driver_version,
+                       int* multi_process,
+                       int* max_threads_per_mp,
+                       int* max_threads_per_block,
+                       std::array<int, 3>* max_grid_dim_size) {
+  backends::gpu::GPUDeviceGuard guard(place.GetDeviceId());
+  *compute_capability =
+      backends::gpu::GetGPUComputeCapability(place.GetDeviceId());
+  *multi_process = backends::gpu::GetGPUMultiProcessors(place.GetDeviceId());
+  *max_threads_per_mp =
+      backends::gpu::GetGPUMaxThreadsPerMultiProcessor(place.GetDeviceId());
+  *max_grid_dim_size = backends::gpu::GetGpuMaxGridDimSize(place.GetDeviceId());
+  *max_threads_per_block =
+      backends::gpu::GetGPUMaxThreadsPerBlock(place.GetDeviceId());
+  *driver_version = backends::gpu::GetGPUDriverVersion(place.GetDeviceId());
+  *runtime_version = backends::gpu::GetGPURuntimeVersion(place.GetDeviceId());
+
+  // TODO(wilber): glog may be replaced in the future?
+  LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: "
+                          << static_cast<int>(place.device)
+                          << ", GPU Compute Capability: "
+                          << *compute_capability / 10 << "."
+                          << *compute_capability % 10
+                          << ", Driver API Version: " << *driver_version / 1000
+                          << "." << (*driver_version % 100) / 10
+                          << ", Runtime API Version: "
+                          << *runtime_version / 1000 << "."
+                          << (*runtime_version % 100) / 10;
+#ifdef PADDLE_WITH_HIP
+  size_t miopen_major, miopen_minor, miopen_patch;
+  PADDLE_ENFORCE_GPU_SUCCESS(
+      dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch));
+  auto cudnn_dso_ver =
+      (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10;
+  auto compile_miopen_version = MIOPEN_VERSION / 10;
+  if (cudnn_dso_ver < static_cast<size_t>(compile_miopen_version)) {
+    LOG_FIRST_N(WARNING, 1)
+        << "WARNING: device: " << static_cast<int>(place.device)
+        << ". The installed Paddle is compiled with MIOPEN "
+        << compile_miopen_version / 100 << "." << compile_miopen_version % 100
+        << ", but MIOPEN version in your machine is " << cudnn_dso_ver / 100
+        << "." << cudnn_dso_ver % 100
+        << ", which may cause serious incompatible bug. "
" + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } +#else + size_t cudnn_dso_ver = dynload::cudnnGetVersion(); + LOG_FIRST_N(WARNING, 1) << "device: " << static_cast(place.device) + << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." + << (cudnn_dso_ver % 1000) / 100 << "."; + + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = + (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; + auto compile_cuda_version = + (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << static_cast(place.device) + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } +#endif +} + +void InitStream(gpuStream_t* stream) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipStreamCreateWithPriority(stream, hipStreamDefault, 0)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaStreamCreateWithPriority(stream, cudaStreamDefault, 0)); +#endif +} + +void DestoryStream(gpuStream_t stream) { + if (stream != nullptr) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipStreamDestroy(stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamDestroy(stream)); +#endif + } + stream = nullptr; +} + +void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream) { +#ifdef PADDLE_WITH_HIP + phi::dynload::rocblas_create_handle(blas_handle); + phi::dynload::rocblas_set_stream(*blas_handle, stream); +#else // PADDLE_WITH_CUDA + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cublasCreate(blas_handle)); + PADDLE_RETRY_CUDA_SUCCESS( + phi::dynload::cublasSetStream(*blas_handle, stream)); +#endif // PADDLE_WITH_HIP +} + +void DestroyBlasHandle(blasHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + phi::dynload::rocblas_destroy_handle(handle); + handle = nullptr; + } +#else + if (handle != nullptr) { + phi::dynload::cublasDestroy(handle); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitBlasLtHandle(blasLtHandle_t* blaslt_handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + phi::dynload::cublasLtCreate(blaslt_handle); +#endif +} + +void DestroyBlasLtHandle(blasLtHandle_t handle) { +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11060 + if (handle != nullptr) { + phi::dynload::cublasLtDestroy(handle); + handle = nullptr; + } +#endif +} + +void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place) { + if (phi::dynload::HasCUDNN()) { +#ifdef PADDLE_WITH_HIP + size_t miopen_major, miopen_minor, miopen_patch; + PADDLE_ENFORCE_GPU_SUCCESS( + dynload::miopenGetVersion(&miopen_major, &miopen_minor, &miopen_patch)); + auto local_miopen_version = + (miopen_major * 1000 + miopen_minor * 10 + miopen_patch) / 10; + auto compile_miopen_version = MIOPEN_VERSION / 10; + if (local_miopen_version < static_cast(compile_miopen_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with MIOPEN " + << compile_miopen_version / 100 << "." << compile_miopen_version % 100 + << ", but MIOPEN version in your machine is " + << local_miopen_version / 100 << "." << local_miopen_version % 100 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible MIOPEN " + "version."; + } + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenCreate(handle)); + PADDLE_ENFORCE_GPU_SUCCESS(dynload::miopenSetStream(*handle, stream)); +#else + auto local_cudnn_version = phi::dynload::cudnnGetVersion() / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cudnn_version < static_cast(compile_cudnn_version)) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cudnnSetStream(*handle, stream)); +#endif + } else { + *handle = nullptr; + } +} + +void DestroyDnnHandle(dnnHandle_t handle) { +#ifdef PADDLE_WITH_HIP + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenDestroy(handle)); + handle = nullptr; + } +#else + if (handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnDestroy(handle)); + handle = nullptr; + } +#endif // PADDLE_WITH_HIP +} + +void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream) { +#ifndef PADDLE_WITH_HIP + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(phi::dynload::cusolverDnSetStream(*handle, stream)); +#endif +} + +void DestroySolverHandle(solverHandle_t solver_handle) { +#ifndef PADDLE_WITH_HIP + if (solver_handle != nullptr) { + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDestroy(solver_handle)); + solver_handle = nullptr; + } +#endif +} + +void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) { +// ROCM is not yet supported +#if defined(PADDLE_WITH_CUDA) +// The generic APIs is supported from CUDA10.1 +#if CUDA_VERSION >= 10010 + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle)); + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream)); +#endif +#endif +} + +void DestroySparseHandle(sparseHandle_t handle) { +#ifdef PADDLE_WITH_CUDA +#if CUDA_VERSION >= 10010 + if (handle != nullptr) { + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle)); + handle = nullptr; + } +#endif +#endif +} + +} // namespace phi diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h new file mode 100644 index 0000000000000..07ccb6215409a --- /dev/null +++ b/paddle/phi/backends/gpu/gpu_resources.h @@ -0,0 +1,51 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/phi/backends/gpu/gpu_resources.h b/paddle/phi/backends/gpu/gpu_resources.h
new file mode 100644
index 0000000000000..07ccb6215409a
--- /dev/null
+++ b/paddle/phi/backends/gpu/gpu_resources.h
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <array>
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#include "paddle/phi/common/place.h"
+
+namespace phi {
+
+void InitGpuProperties(Place place,
+                       int* compute_capability,
+                       int* runtime_version,
+                       int* driver_version,
+                       int* multi_process,
+                       int* max_threads_per_mp,
+                       int* max_threads_per_block,
+                       std::array<int, 3>* max_grid_dim_size);
+
+void InitStream(gpuStream_t* stream);
+void DestoryStream(gpuStream_t stream);
+
+void InitBlasHandle(blasHandle_t* blas_handle, gpuStream_t stream);
+void DestroyBlasHandle(blasHandle_t handle);
+
+void InitBlasLtHandle(blasLtHandle_t* blaslt_handle);
+void DestroyBlasLtHandle(blasLtHandle_t handle);
+
+void InitDnnHandle(dnnHandle_t* handle, gpuStream_t stream, Place place);
+void DestroyDnnHandle(dnnHandle_t handle);
+
+void InitSolverHandle(solverHandle_t* handle, gpuStream_t stream);
+void DestroySolverHandle(solverHandle_t solver_handle);
+
+void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream);
+void DestroySparseHandle(sparseHandle_t handle);
+
+// void InitDnnWorkspace();
+
+}  // namespace phi